From 3dbdb010eb4d3ca46ab58ae174fc3e4488821b17 Mon Sep 17 00:00:00 2001 From: Ivan Zubenko Date: Thu, 12 Dec 2024 11:01:10 +0200 Subject: [PATCH] ENG-504 update cluster, node pool patch methods (#139) --- neuro_config_client/__init__.py | 20 +- neuro_config_client/client.py | 100 ++------ neuro_config_client/entities.py | 171 +++++++++---- neuro_config_client/factories.py | 268 +++++++++++++------- tests/test_factories.py | 415 ++++++++++++++++++++++--------- 5 files changed, 630 insertions(+), 344 deletions(-) diff --git a/neuro_config_client/__init__.py b/neuro_config_client/__init__.py index 2902ae9..9d2ef21 100644 --- a/neuro_config_client/__init__.py +++ b/neuro_config_client/__init__.py @@ -5,16 +5,15 @@ from .client import ConfigClient, ConfigClientBase from .entities import ( ACMEEnvironment, + AddNodePoolRequest, ARecord, AWSCloudProvider, AWSCredentials, AWSStorage, - AWSStorageOptions, AzureCloudProvider, AzureCredentials, AzureReplicationType, AzureStorage, - AzureStorageOptions, AzureStorageTier, BucketsConfig, CloudProvider, @@ -36,7 +35,6 @@ GoogleCloudProvider, GoogleFilestoreTier, GoogleStorage, - GoogleStorageOptions, GrafanaCredentials, HelmRegistryConfig, IdleJobConfig, @@ -52,6 +50,11 @@ OnPremCloudProvider, OpenStackCredentials, OrchestratorConfig, + PatchClusterRequest, + PatchNodePoolResourcesRequest, + PatchNodePoolSizeRequest, + PatchOrchestratorConfigRequest, + PutNodePoolRequest, RegistryConfig, ResourcePoolType, ResourcePreset, @@ -61,7 +64,6 @@ Storage, StorageConfig, StorageInstance, - StorageOptions, TPUPreset, TPUResource, VCDCloudProvider, @@ -75,16 +77,15 @@ "ConfigClient", "ConfigClientBase", "ACMEEnvironment", + "AddNodePoolRequest", "ARecord", "AWSCloudProvider", "AWSCredentials", "AWSStorage", - "AWSStorageOptions", "AzureCloudProvider", "AzureCredentials", "AzureReplicationType", "AzureStorage", - "AzureStorageOptions", "AzureStorageTier", "BucketsConfig", "CloudProvider", @@ -106,7 +107,6 @@ "GoogleCloudProvider", "GoogleFilestoreTier", "GoogleStorage", - "GoogleStorageOptions", "GrafanaCredentials", "HelmRegistryConfig", "IdleJobConfig", @@ -122,6 +122,11 @@ "OnPremCloudProvider", "OpenStackCredentials", "OrchestratorConfig", + "PatchClusterRequest", + "PatchNodePoolResourcesRequest", + "PatchNodePoolSizeRequest", + "PatchOrchestratorConfigRequest", + "PutNodePoolRequest", "RegistryConfig", "ResourcePoolType", "ResourcePreset", @@ -131,7 +136,6 @@ "Storage", "StorageConfig", "StorageInstance", - "StorageOptions", "TPUPreset", "TPUResource", "VCDCloudProvider", diff --git a/neuro_config_client/client.py b/neuro_config_client/client.py index 540089d..9d09ebb 100644 --- a/neuro_config_client/client.py +++ b/neuro_config_client/client.py @@ -2,7 +2,6 @@ import abc import logging -import sys from collections.abc import AsyncIterator, Mapping, Sequence from contextlib import AbstractAsyncContextManager, asynccontextmanager from dataclasses import dataclass @@ -14,33 +13,20 @@ from yarl import URL from .entities import ( - BucketsConfig, + AddNodePoolRequest, CloudProviderOptions, CloudProviderType, Cluster, - CredentialsConfig, - DisksConfig, - DNSConfig, - EnergyConfig, - IngressConfig, - MetricsConfig, - MonitoringConfig, NodePool, NotificationType, - OrchestratorConfig, - RegistryConfig, + PatchClusterRequest, + PatchNodePoolResourcesRequest, + PatchNodePoolSizeRequest, + PutNodePoolRequest, ResourcePreset, - SecretsConfig, - StorageConfig, ) from .factories import EntityFactory, PayloadFactory -if sys.version_info >= (3, 9): - from zoneinfo import 
ZoneInfo -else: - # why not backports.zoneinfo: https://github.com/pganssle/zoneinfo/issues/125 - from backports.zoneinfo._zoneinfo import ZoneInfo - logger = logging.getLogger(__name__) @@ -170,56 +156,10 @@ async def create_blank_cluster( return await self.get_cluster(name) async def patch_cluster( - self, - name: str, - *, - credentials: CredentialsConfig | None = None, - storage: StorageConfig | None = None, - registry: RegistryConfig | None = None, - orchestrator: OrchestratorConfig | None = None, - monitoring: MonitoringConfig | None = None, - secrets: SecretsConfig | None = None, - metrics: MetricsConfig | None = None, - disks: DisksConfig | None = None, - buckets: BucketsConfig | None = None, - ingress: IngressConfig | None = None, - dns: DNSConfig | None = None, - timezone: ZoneInfo | None = None, - energy: EnergyConfig | None = None, - token: str | None = None, + self, name: str, request: PatchClusterRequest, *, token: str | None = None ) -> Cluster: path = self._endpoints.cluster(name) - payload: dict[str, Any] = {} - if credentials: - payload["credentials"] = self._payload_factory.create_credentials( - credentials - ) - if storage: - payload["storage"] = self._payload_factory.create_storage(storage) - if registry: - payload["registry"] = self._payload_factory.create_registry(registry) - if orchestrator: - payload["orchestrator"] = self._payload_factory.create_orchestrator( - orchestrator - ) - if monitoring: - payload["monitoring"] = self._payload_factory.create_monitoring(monitoring) - if secrets: - payload["secrets"] = self._payload_factory.create_secrets(secrets) - if metrics: - payload["metrics"] = self._payload_factory.create_metrics(metrics) - if disks: - payload["disks"] = self._payload_factory.create_disks(disks) - if buckets: - payload["buckets"] = self._payload_factory.create_buckets(buckets) - if ingress: - payload["ingress"] = self._payload_factory.create_ingress(ingress) - if dns: - payload["dns"] = self._payload_factory.create_dns(dns) - if timezone: - payload["timezone"] = str(timezone) - if energy: - payload["energy"] = self._payload_factory.create_energy(energy) + payload = self._payload_factory.create_patch_cluster_request(request) async with self._request( "PATCH", path, headers=self._create_headers(token=token), json=payload ) as resp: @@ -337,7 +277,7 @@ async def list_node_pools( async def add_node_pool( self, cluster_name: str, - node_pool: NodePool, + node_pool: AddNodePoolRequest, *, start_deployment: bool = True, token: str | None = None, @@ -345,8 +285,9 @@ async def add_node_pool( """Add new node pool to the existing cluster. Cloud provider should be already set up. - Make sure you use one of the available node pool templates by providing its ID, - if the cluster is deployed in public cloud (AWS / GCP / Azure / VCD). + Make sure you use one of the available node pool templates by providing + its machine type, if the cluster is deployed in public cloud + (AWS / GCP / Azure / VCD). Args: cluster_name (str): Name of the cluster within the platform. 
@@ -358,7 +299,7 @@ async def add_node_pool( Cluster: Cluster instance with applied changes """ path = self._endpoints.node_pools(cluster_name) - payload = self._payload_factory.create_node_pool(node_pool) + payload = self._payload_factory.create_add_node_pool_request(node_pool) async with self._request( "POST", path, @@ -372,13 +313,13 @@ async def add_node_pool( async def put_node_pool( self, cluster_name: str, - node_pool: NodePool, + node_pool: PutNodePoolRequest, *, start_deployment: bool = True, token: str | None = None, ) -> Cluster: path = self._endpoints.node_pool(cluster_name, node_pool.name) - payload = self._payload_factory.create_node_pool(node_pool) + payload = self._payload_factory.create_add_node_pool_request(node_pool) async with self._request( "PUT", path, @@ -393,16 +334,19 @@ async def patch_node_pool( self, cluster_name: str, node_pool_name: str, + request: PatchNodePoolSizeRequest | PatchNodePoolResourcesRequest, *, - idle_size: int | None = None, + start_deployment: bool = True, token: str | None = None, ) -> Cluster: path = self._endpoints.node_pool(cluster_name, node_pool_name) - payload: dict[str, Any] = {} - if idle_size is not None: - payload["idle_size"] = idle_size + payload = self._payload_factory.create_patch_node_pool_request(request) async with self._request( - "PATCH", path, headers=self._create_headers(token=token), json=payload + "PATCH", + path, + params={"start_deployment": str(start_deployment).lower()}, + headers=self._create_headers(token=token), + json=payload, ) as response: resp_payload = await response.json() return self._entity_factory.create_cluster(resp_payload) diff --git a/neuro_config_client/entities.py b/neuro_config_client/entities.py index 7a53e68..39b1f1b 100644 --- a/neuro_config_client/entities.py +++ b/neuro_config_client/entities.py @@ -42,7 +42,6 @@ def is_vcd(self) -> bool: class CloudProviderOptions: type: CloudProviderType node_pools: list[NodePoolOptions] - storages: list[StorageOptions] @dataclass(frozen=True) @@ -59,77 +58,86 @@ class VCDCloudProviderOptions(CloudProviderOptions): @dataclass(frozen=True) class NodePoolOptions: - id: str machine_type: str cpu: float - available_cpu: float memory: int - available_memory: int - gpu: int | None = None - gpu_model: str | None = None + available_cpu: float | None = None + available_memory: int | None = None + nvidia_gpu: int | None = None + nvidia_gpu_model: str | None = None -@dataclass(frozen=True) -class StorageOptions: - id: str +class NodeRole(str, enum.Enum): + KUBERNETES = "kubernetes" + PLATFORM = "platform" + PLATFORM_JOB = "platform_job" @dataclass(frozen=True) -class GoogleStorageOptions(StorageOptions): - tier: GoogleFilestoreTier - min_capacity: int - max_capacity: int +class NodePool: + name: str + cpu: float + available_cpu: float + memory: int + available_memory: int + disk_size: int + available_disk_size: int -@dataclass(frozen=True) -class AWSStorageOptions(StorageOptions): - performance_mode: EFSPerformanceMode - throughput_mode: EFSThroughputMode - provisioned_throughput_mibps: int | None = None + role: NodeRole = NodeRole.PLATFORM_JOB + min_size: int = 0 + max_size: int = 1 + idle_size: int | None = None -@dataclass(frozen=True) -class AzureStorageOptions(StorageOptions): - tier: AzureStorageTier - replication_type: AzureReplicationType - min_file_share_size: int - max_file_share_size: int + machine_type: str | None = None + disk_type: str | None = None -class NodeRole(str, enum.Enum): - KUBERNETES = "kubernetes" - PLATFORM = "platform" - PLATFORM_JOB = 
"platform_job" + nvidia_gpu: int | None = None + nvidia_gpu_model: str | None = None + amd_gpu: int | None = None + amd_gpu_model: str | None = None + intel_gpu: int | None = None + intel_gpu_model: str | None = None + + price: Decimal | None = None + currency: str | None = None + + is_preemptible: bool | None = None + + zones: tuple[str, ...] | None = None + + cpu_min_watts: float = 0.0 + cpu_max_watts: float = 0.0 @dataclass(frozen=True) -class NodePool: +class AddNodePoolRequest: name: str - id: str | None = None - role: NodeRole = NodeRole.PLATFORM_JOB - min_size: int = 0 - max_size: int = 1 + min_size: int + max_size: int idle_size: int | None = None + role: NodeRole = NodeRole.PLATFORM_JOB + machine_type: str | None = None + cpu: float | None = None available_cpu: float | None = None memory: int | None = None available_memory: int | None = None - - disk_size: int | None = None disk_type: str | None = None + disk_size: int | None = None + available_disk_size: int | None = None nvidia_gpu: int | None = None - amd_gpu: int | None = None - intel_gpu: int | None = None nvidia_gpu_model: str | None = None + amd_gpu: int | None = None amd_gpu_model: str | None = None + intel_gpu: int | None = None intel_gpu_model: str | None = None - # todo: two props below are already deprecated - gpu: int | None = None - gpu_model: str | None = None price: Decimal | None = None currency: str | None = None @@ -138,8 +146,40 @@ class NodePool: zones: tuple[str, ...] | None = None - cpu_min_watts: float = 0.0 - cpu_max_watts: float = 0.0 + cpu_min_watts: float | None = None + cpu_max_watts: float | None = None + + +PutNodePoolRequest = AddNodePoolRequest + + +@dataclass(frozen=True) +class PatchNodePoolSizeRequest: + min_size: int | None = None + max_size: int | None = None + idle_size: int | None = None + + +@dataclass(frozen=True) +class PatchNodePoolResourcesRequest: + cpu: float + available_cpu: float + memory: int + available_memory: int + disk_size: int + available_disk_size: int + + nvidia_gpu: int | None = None + nvidia_gpu_model: str | None = None + amd_gpu: int | None = None + amd_gpu_model: str | None = None + intel_gpu: int | None = None + intel_gpu_model: str | None = None + + machine_type: str | None = None + + min_size: int | None = None + max_size: int | None = None @dataclass(frozen=True) @@ -183,7 +223,6 @@ class EFSThroughputMode(str, enum.Enum): @dataclass(frozen=True) class AWSStorage(Storage): - id: str description: str performance_mode: EFSPerformanceMode throughput_mode: EFSThroughputMode @@ -215,7 +254,6 @@ class GoogleFilestoreTier(str, enum.Enum): @dataclass(frozen=True) class GoogleStorage(Storage): - id: str description: str tier: GoogleFilestoreTier @@ -254,7 +292,6 @@ class AzureReplicationType(str, enum.Enum): @dataclass(frozen=True) class AzureStorage(Storage): - id: str description: str tier: AzureStorageTier replication_type: AzureReplicationType @@ -494,10 +531,11 @@ class ResourcePoolType: idle_size: int = 0 cpu: float = 1.0 - available_cpu: float = 1.0 # TODO: deprecated, use cpu instead + available_cpu: float = 1.0 memory: int = 2**30 # 1gb - available_memory: int = 2**30 # TODO: deprecated, use memory instead + available_memory: int = 2**30 disk_size: int = 150 * 2**30 # 150gb + available_disk_size: int = 150 * 2**30 # 150gb nvidia_gpu: int | None = None amd_gpu: int | None = None @@ -518,9 +556,11 @@ class ResourcePoolType: @dataclass(frozen=True) class Resources: - cpu_m: int + cpu: float memory: int - gpu: int = 0 + nvidia_gpu: int = 0 + amd_gpu: int = 0 + 
intel_gpu: int = 0 @dataclass(frozen=True) @@ -552,6 +592,22 @@ class OrchestratorConfig: idle_jobs: Sequence[IdleJobConfig] = () +@dataclass +class PatchOrchestratorConfigRequest: + job_hostname_template: str | None = None + job_internal_hostname_template: str | None = None + job_fallback_hostname: str | None = None + job_schedule_timeout_s: float | None = None + job_schedule_scale_up_timeout_s: float | None = None + is_http_ingress_secure: bool | None = None + resource_pool_types: Sequence[ResourcePoolType] | None = None + resource_presets: Sequence[ResourcePreset] | None = None + allow_privileged_mode: bool | None = None + allow_job_priority: bool | None = None + pre_pull_images: Sequence[str] | None = None + idle_jobs: Sequence[IdleJobConfig] | None = None + + @dataclass class ARecord: name: str @@ -618,3 +674,20 @@ class Cluster: buckets: BucketsConfig | None = None ingress: IngressConfig | None = None energy: EnergyConfig | None = None + + +@dataclass(frozen=True) +class PatchClusterRequest: + credentials: CredentialsConfig | None = None + storage: StorageConfig | None = None + registry: RegistryConfig | None = None + orchestrator: PatchOrchestratorConfigRequest | None = None + monitoring: MonitoringConfig | None = None + secrets: SecretsConfig | None = None + metrics: MetricsConfig | None = None + disks: DisksConfig | None = None + buckets: BucketsConfig | None = None + ingress: IngressConfig | None = None + dns: DNSConfig | None = None + timezone: ZoneInfo | None = None + energy: EnergyConfig | None = None diff --git a/neuro_config_client/factories.py b/neuro_config_client/factories.py index 925ec2c..ec9ccb3 100644 --- a/neuro_config_client/factories.py +++ b/neuro_config_client/factories.py @@ -9,16 +9,15 @@ from .entities import ( ACMEEnvironment, + AddNodePoolRequest, ARecord, AWSCloudProvider, AWSCredentials, AWSStorage, - AWSStorageOptions, AzureCloudProvider, AzureCredentials, AzureReplicationType, AzureStorage, - AzureStorageOptions, AzureStorageTier, BucketsConfig, CloudProvider, @@ -40,7 +39,6 @@ GoogleCloudProvider, GoogleFilestoreTier, GoogleStorage, - GoogleStorageOptions, GrafanaCredentials, HelmRegistryConfig, IdleJobConfig, @@ -56,6 +54,10 @@ OnPremCloudProvider, OpenStackCredentials, OrchestratorConfig, + PatchClusterRequest, + PatchNodePoolResourcesRequest, + PatchNodePoolSizeRequest, + PatchOrchestratorConfigRequest, RegistryConfig, ResourcePoolType, ResourcePreset, @@ -64,7 +66,6 @@ SentryCredentials, StorageConfig, StorageInstance, - StorageOptions, TPUPreset, TPUResource, VCDCloudProvider, @@ -93,9 +94,6 @@ def create_cloud_provider_options( node_pools=[ cls.create_node_pool_options(p) for p in payload.get("node_pools", ()) ], - storages=[ - cls.create_storage_options(type, p) for p in payload.get("storages", ()) - ], ) @classmethod @@ -108,9 +106,6 @@ def _create_vcd_cloud_provider_options( node_pools=[ cls.create_node_pool_options(p) for p in payload.get("node_pools", ()) ], - storages=[ - cls.create_storage_options(type, p) for p in payload.get("storages", ()) - ], kubernetes_node_pool_id=payload["kubernetes_node_pool_id"], platform_node_pool_id=payload["platform_node_pool_id"], url=URL(url) if url else None, @@ -124,57 +119,15 @@ def _create_vcd_cloud_provider_options( @staticmethod def create_node_pool_options(payload: dict[str, Any]) -> NodePoolOptions: return NodePoolOptions( - id=payload["id"], machine_type=payload["machine_type"], cpu=payload["cpu"], - available_cpu=payload["available_cpu"], + available_cpu=payload.get("available_cpu"), 
memory=payload["memory"], - available_memory=payload["available_memory"], - gpu=payload.get("gpu"), - gpu_model=payload.get("gpu_model"), - ) - - @classmethod - def create_storage_options( - cls, type: CloudProviderType, payload: dict[str, Any] - ) -> StorageOptions: - if type == CloudProviderType.AWS: - return cls.create_aws_storage_options(payload) - elif type == CloudProviderType.GCP: - return cls.create_google_storage_options(payload) - elif type == CloudProviderType.AZURE: - return cls.create_azure_storage_options(payload) - else: - raise ValueError( - f"Storage options are not supported for {type.value!r} cloud provider" - ) - - @staticmethod - def create_aws_storage_options(payload: dict[str, Any]) -> AWSStorageOptions: - return AWSStorageOptions( - id=payload["id"], - performance_mode=EFSPerformanceMode(payload["performance_mode"]), - throughput_mode=EFSThroughputMode(payload["throughput_mode"]), - provisioned_throughput_mibps=payload.get("provisioned_throughput_mibps"), - ) - - @staticmethod - def create_google_storage_options(payload: dict[str, Any]) -> GoogleStorageOptions: - return GoogleStorageOptions( - id=payload["id"], - tier=GoogleFilestoreTier(payload["tier"]), - min_capacity=payload["min_capacity"], - max_capacity=payload["max_capacity"], - ) - - @staticmethod - def create_azure_storage_options(payload: dict[str, Any]) -> AzureStorageOptions: - return AzureStorageOptions( - id=payload["id"], - tier=AzureStorageTier(payload["tier"]), - replication_type=AzureReplicationType(payload["replication_type"]), - min_file_share_size=payload["min_file_share_size"], - max_file_share_size=payload["max_file_share_size"], + available_memory=payload.get("available_memory"), + nvidia_gpu=payload.get("nvidia_gpu") or payload.get("gpu"), + nvidia_gpu_model=( + payload.get("nvidia_gpu_model") or payload.get("gpu_model") + ), ) def create_cluster(self, payload: dict[str, Any]) -> Cluster: @@ -330,7 +283,11 @@ def create_idle_job(self, payload: dict[str, Any]) -> IdleJobConfig: def create_resources(self, payload: dict[str, Any]) -> Resources: return Resources( - cpu_m=payload["cpu_m"], memory=payload["memory"], gpu=payload.get("gpu", 0) + cpu=payload["cpu"], + memory=payload["memory"], + nvidia_gpu=payload.get("nvidia_gpu", 0), + amd_gpu=payload.get("amd_gpu", 0), + intel_gpu=payload.get("intel_gpu", 0), ) def create_storage(self, payload: dict[str, Any]) -> StorageConfig: @@ -427,26 +384,28 @@ def _create_aws_cloud_provider(self, payload: dict[str, Any]) -> CloudProvider: ) def create_node_pool(self, payload: dict[str, Any]) -> NodePool: - price = Decimal(payload["price"]) if payload.get("price") else NodePool.price + price_value = payload.get("price") + price = Decimal(price_value) if price_value is not None else NodePool.price + disk_size = payload.get("disk_size", 0) return NodePool( name=payload["name"], - id=payload.get("id"), role=NodeRole(payload["role"]), min_size=payload["min_size"], max_size=payload["max_size"], - cpu=payload.get("cpu"), - available_cpu=payload.get("available_cpu"), - memory=payload.get("memory"), - available_memory=payload.get("available_memory"), - disk_size=payload.get("disk_size", NodePool.disk_size), + cpu=payload["cpu"], + available_cpu=payload["available_cpu"], + memory=payload["memory"], + available_memory=payload["available_memory"], + disk_size=disk_size, + available_disk_size=payload.get("available_disk_size", disk_size), disk_type=payload.get("disk_type", NodePool.disk_type), nvidia_gpu=payload.get("nvidia_gpu") or payload.get("gpu"), - 
amd_gpu=payload.get("amd_gpu"), - intel_gpu=payload.get("intel_gpu"), nvidia_gpu_model=( payload.get("nvidia_gpu_model") or payload.get("gpu_model") ), + amd_gpu=payload.get("amd_gpu"), amd_gpu_model=payload.get("amd_gpu_model"), + intel_gpu=payload.get("intel_gpu"), intel_gpu_model=payload.get("intel_gpu_model"), price=price, currency=payload.get("currency", NodePool.currency), @@ -460,7 +419,6 @@ def create_node_pool(self, payload: dict[str, Any]) -> NodePool: def _create_aws_storage(self, payload: dict[str, Any]) -> AWSStorage: result = AWSStorage( - id=payload["id"], description=payload["description"], performance_mode=EFSPerformanceMode(payload["performance_mode"]), throughput_mode=EFSThroughputMode(payload["throughput_mode"]), @@ -482,7 +440,6 @@ def _create_google_cloud_provider(self, payload: dict[str, Any]) -> CloudProvide def _create_google_storage(self, payload: dict[str, Any]) -> GoogleStorage: result = GoogleStorage( - id=payload["id"], description=payload["description"], tier=GoogleFilestoreTier(payload["tier"]), instances=[self._create_storage_instance(p) for p in payload["instances"]], @@ -507,7 +464,6 @@ def _create_azure_cloud_provider(self, payload: dict[str, Any]) -> CloudProvider def _create_azure_storage(self, payload: dict[str, Any]) -> AzureStorage: result = AzureStorage( - id=payload["id"], description=payload["description"], replication_type=AzureReplicationType(payload["replication_type"]), tier=AzureStorageTier(payload["tier"]), @@ -713,6 +669,41 @@ def _create_energy_schedule_period( class PayloadFactory: + @classmethod + def create_patch_cluster_request( + cls, request: PatchClusterRequest + ) -> dict[str, Any]: + payload: dict[str, Any] = {} + if request.credentials: + payload["credentials"] = cls.create_credentials(request.credentials) + if request.storage: + payload["storage"] = cls.create_storage(request.storage) + if request.registry: + payload["registry"] = cls.create_registry(request.registry) + if request.orchestrator: + payload["orchestrator"] = cls.create_patch_orchestrator_request( + request.orchestrator + ) + if request.monitoring: + payload["monitoring"] = cls.create_monitoring(request.monitoring) + if request.secrets: + payload["secrets"] = cls.create_secrets(request.secrets) + if request.metrics: + payload["metrics"] = cls.create_metrics(request.metrics) + if request.disks: + payload["disks"] = cls.create_disks(request.disks) + if request.buckets: + payload["buckets"] = cls.create_buckets(request.buckets) + if request.ingress: + payload["ingress"] = cls.create_ingress(request.ingress) + if request.dns: + payload["dns"] = cls.create_dns(request.dns) + if request.timezone: + payload["timezone"] = str(request.timezone) + if request.energy: + payload["energy"] = cls.create_energy(request.energy) + return payload + @classmethod def create_credentials(cls, credentials: CredentialsConfig) -> dict[str, Any]: result = { @@ -873,6 +864,49 @@ def create_orchestrator(self, orchestrator: OrchestratorConfig) -> dict[str, Any ] return result + @classmethod + def create_patch_orchestrator_request( + cls, orchestrator: PatchOrchestratorConfigRequest + ) -> dict[str, Any]: + payload: dict[str, Any] = {} + if orchestrator.job_hostname_template: + payload["job_hostname_template"] = orchestrator.job_hostname_template + if orchestrator.job_internal_hostname_template: + payload["job_internal_hostname_template"] = ( + orchestrator.job_internal_hostname_template + ) + if orchestrator.is_http_ingress_secure is not None: + payload["is_http_ingress_secure"] = 
orchestrator.is_http_ingress_secure + if orchestrator.job_fallback_hostname: + payload["job_fallback_hostname"] = orchestrator.job_fallback_hostname + if orchestrator.job_schedule_timeout_s is not None: + payload["job_schedule_timeout_s"] = orchestrator.job_schedule_timeout_s + if orchestrator.job_schedule_scale_up_timeout_s is not None: + payload["job_schedule_scale_up_timeout_s"] = ( + orchestrator.job_schedule_scale_up_timeout_s + ) + if orchestrator.allow_privileged_mode is not None: + payload["allow_privileged_mode"] = orchestrator.allow_privileged_mode + if orchestrator.allow_job_priority is not None: + payload["allow_job_priority"] = orchestrator.allow_job_priority + if orchestrator.resource_pool_types: + payload["resource_pool_types"] = [ + cls.create_resource_pool_type(r) + for r in orchestrator.resource_pool_types + ] + if orchestrator.resource_presets: + payload["resource_presets"] = [ + cls.create_resource_preset(preset) + for preset in orchestrator.resource_presets + ] + if orchestrator.pre_pull_images: + payload["pre_pull_images"] = orchestrator.pre_pull_images + if orchestrator.idle_jobs: + payload["idle_jobs"] = [ + cls._create_idle_job(job) for job in orchestrator.idle_jobs + ] + return payload + @classmethod def create_resource_pool_type( cls, resource_pool_type: ResourcePoolType @@ -888,6 +922,7 @@ def create_resource_pool_type( "memory": resource_pool_type.memory, "available_memory": resource_pool_type.available_memory, "disk_size": resource_pool_type.disk_size, + "available_disk_size": resource_pool_type.available_disk_size, } if resource_pool_type.nvidia_gpu: result["nvidia_gpu"] = resource_pool_type.nvidia_gpu @@ -970,9 +1005,13 @@ def _create_idle_job(cls, idle_job: IdleJobConfig) -> dict[str, Any]: @classmethod def _create_resources(cls, resources: Resources) -> dict[str, Any]: - result = {"cpu_m": resources.cpu_m, "memory": resources.memory} - if resources.gpu: - result["gpu"] = resources.gpu + result = {"cpu": resources.cpu, "memory": resources.memory} + if resources.nvidia_gpu: + result["nvidia_gpu"] = resources.nvidia_gpu + if resources.amd_gpu: + result["amd_gpu"] = resources.amd_gpu + if resources.intel_gpu: + result["intel_gpu"] = resources.intel_gpu return result @classmethod @@ -1028,15 +1067,15 @@ def create_ingress(cls, ingress: IngressConfig) -> dict[str, Any]: return result @classmethod - def create_node_pool(cls, node_pool: NodePool) -> dict[str, Any]: + def create_add_node_pool_request( + cls, node_pool: AddNodePoolRequest + ) -> dict[str, Any]: result: dict[str, Any] = { "name": node_pool.name, "role": node_pool.role.value, "min_size": node_pool.min_size, "max_size": node_pool.max_size, } - if node_pool.id: - result["id"] = node_pool.id if node_pool.idle_size: result["idle_size"] = node_pool.idle_size if node_pool.machine_type: @@ -1051,23 +1090,23 @@ def create_node_pool(cls, node_pool: NodePool) -> dict[str, Any]: result["available_memory"] = node_pool.available_memory if node_pool.disk_size: result["disk_size"] = node_pool.disk_size + if node_pool.available_disk_size: + result["available_disk_size"] = node_pool.available_disk_size if node_pool.disk_type: result["disk_type"] = node_pool.disk_type - nvidia_gpu = node_pool.nvidia_gpu or node_pool.gpu - if nvidia_gpu: - result["nvidia_gpu"] = nvidia_gpu + if node_pool.nvidia_gpu: + result["nvidia_gpu"] = node_pool.nvidia_gpu + if node_pool.nvidia_gpu_model: + result["nvidia_gpu_model"] = node_pool.nvidia_gpu_model if node_pool.amd_gpu: result["amd_gpu"] = node_pool.amd_gpu - if node_pool.intel_gpu: - 
result["intel_gpu"] = node_pool.intel_gpu - nvidia_gpu_model = node_pool.nvidia_gpu_model or node_pool.gpu_model - if nvidia_gpu_model: - result["nvidia_gpu_model"] = nvidia_gpu_model if node_pool.amd_gpu_model: result["amd_gpu_model"] = node_pool.amd_gpu_model + if node_pool.intel_gpu: + result["intel_gpu"] = node_pool.intel_gpu if node_pool.intel_gpu_model: result["intel_gpu_model"] = node_pool.intel_gpu_model - if node_pool.price: + if node_pool.price is not None: result["price"] = str(node_pool.price) if node_pool.currency: result["currency"] = node_pool.currency @@ -1075,12 +1114,65 @@ def create_node_pool(cls, node_pool: NodePool) -> dict[str, Any]: result["is_preemptible"] = node_pool.is_preemptible if node_pool.zones: result["zones"] = node_pool.zones - if node_pool.cpu_min_watts: + if node_pool.cpu_min_watts is not None: result["cpu_min_watts"] = node_pool.cpu_min_watts - if node_pool.cpu_max_watts: + if node_pool.cpu_max_watts is not None: result["cpu_max_watts"] = node_pool.cpu_max_watts return result + @classmethod + def create_patch_node_pool_request( + cls, request: PatchNodePoolSizeRequest | PatchNodePoolResourcesRequest + ) -> dict[str, Any]: + if isinstance(request, PatchNodePoolSizeRequest): + return cls._create_patch_node_pool_size_request(request) + elif isinstance(request, PatchNodePoolResourcesRequest): + return cls._create_patch_node_pool_resources_request(request) + msg = "Request type is not supported" + raise ValueError(msg) + + @classmethod + def _create_patch_node_pool_size_request( + cls, request: PatchNodePoolSizeRequest + ) -> dict[str, Any]: + payload: dict[str, Any] = {} + if request.min_size is not None: + payload["min_size"] = request.min_size + if request.max_size is not None: + payload["max_size"] = request.max_size + if request.idle_size is not None: + payload["idle_size"] = request.idle_size + return payload + + @classmethod + def _create_patch_node_pool_resources_request( + cls, request: PatchNodePoolResourcesRequest + ) -> dict[str, Any]: + payload: dict[str, Any] = { + "cpu": request.cpu, + "available_cpu": request.available_cpu, + "memory": request.memory, + "available_memory": request.available_memory, + "disk_size": request.disk_size, + "available_disk_size": request.available_disk_size, + } + if request.nvidia_gpu: + payload["nvidia_gpu"] = request.nvidia_gpu + payload["nvidia_gpu_model"] = request.nvidia_gpu_model + if request.amd_gpu: + payload["amd_gpu"] = request.amd_gpu + payload["amd_gpu_model"] = request.amd_gpu_model + if request.intel_gpu: + payload["intel_gpu"] = request.intel_gpu + payload["intel_gpu_model"] = request.intel_gpu_model + if request.machine_type: + payload["machine_type"] = request.machine_type + if request.min_size is not None: + payload["min_size"] = request.min_size + if request.max_size is not None: + payload["max_size"] = request.max_size + return payload + @classmethod def create_energy(cls, energy: EnergyConfig) -> dict[str, Any]: return { diff --git a/tests/test_factories.py b/tests/test_factories.py index b7b2df8..360cbfa 100644 --- a/tests/test_factories.py +++ b/tests/test_factories.py @@ -12,16 +12,15 @@ from neuro_config_client.entities import ( ACMEEnvironment, + AddNodePoolRequest, ARecord, AWSCloudProvider, AWSCredentials, AWSStorage, - AWSStorageOptions, AzureCloudProvider, AzureCredentials, AzureReplicationType, AzureStorage, - AzureStorageOptions, AzureStorageTier, BucketsConfig, CloudProviderOptions, @@ -42,7 +41,6 @@ GoogleCloudProvider, GoogleFilestoreTier, GoogleStorage, - GoogleStorageOptions, 
GrafanaCredentials, HelmRegistryConfig, IdleJobConfig, @@ -57,6 +55,10 @@ OnPremCloudProvider, OpenStackCredentials, OrchestratorConfig, + PatchClusterRequest, + PatchNodePoolResourcesRequest, + PatchNodePoolSizeRequest, + PatchOrchestratorConfigRequest, RegistryConfig, ResourcePoolType, ResourcePreset, @@ -208,7 +210,13 @@ def test_create_orchestrator(self, factory: EntityFactory) -> None: "name": "idle", "count": 1, "image": "miner", - "resources": {"cpu_m": 1000, "memory": 1024}, + "resources": { + "cpu": 1, + "memory": 1024, + "nvidia_gpu": 1, + "amd_gpu": 2, + "intel_gpu": 3, + }, }, { "name": "idle", @@ -216,7 +224,7 @@ def test_create_orchestrator(self, factory: EntityFactory) -> None: "image": "miner", "command": ["bash"], "args": ["-c", "sleep infinity"], - "resources": {"cpu_m": 1000, "memory": 1024}, + "resources": {"cpu": 1, "memory": 1024}, "env": {"NAME": "VALUE"}, "node_selector": {"label": "value"}, "image_pull_secret": "secret", @@ -242,7 +250,13 @@ def test_create_orchestrator(self, factory: EntityFactory) -> None: name="idle", count=1, image="miner", - resources=Resources(cpu_m=1000, memory=1024), + resources=Resources( + cpu=1, + memory=1024, + nvidia_gpu=1, + amd_gpu=2, + intel_gpu=3, + ), ), IdleJobConfig( name="idle", @@ -250,7 +264,7 @@ def test_create_orchestrator(self, factory: EntityFactory) -> None: image="miner", command=["bash"], args=["-c", "sleep infinity"], - resources=Resources(cpu_m=1000, memory=1024), + resources=Resources(cpu=1, memory=1024), env={"NAME": "VALUE"}, node_selector={"label": "value"}, image_pull_secret="secret", @@ -565,7 +579,6 @@ def google_cloud_provider_response(self) -> dict[str, Any]: }, "node_pools": [ { - "id": "n1_highmem_8", "name": "n1-highmem-8", "role": "platform_job", "machine_type": "n1-highmem-8", @@ -576,9 +589,9 @@ def google_cloud_provider_response(self) -> dict[str, Any]: "memory": 52 * 1024, "available_memory": 45 * 1024, "disk_size": 700, + "available_disk_size": 670, }, { - "id": "n1_highmem_32", "name": "n1-highmem-32-1xk80-preemptible", "role": "platform_job", "machine_type": "n1-highmem-32", @@ -590,13 +603,13 @@ def google_cloud_provider_response(self) -> dict[str, Any]: "memory": 208 * 1024, "available_memory": 201 * 1024, "disk_size": 700, + "available_disk_size": 670, "nvidia_gpu": 1, "nvidia_gpu_model": "nvidia-tesla-k80", "is_preemptible": True, }, ], "storage": { - "id": "premium", "description": "GCP Filestore (Premium)", "backend": "filestore", "tier": "PREMIUM", @@ -629,7 +642,6 @@ def google_cloud_provider(self) -> GoogleCloudProvider: node_pools=[ NodePool( name="n1-highmem-8", - id="n1_highmem_8", machine_type="n1-highmem-8", min_size=0, max_size=1, @@ -638,10 +650,10 @@ def google_cloud_provider(self) -> GoogleCloudProvider: memory=52 * 1024, available_memory=45 * 1024, disk_size=700, + available_disk_size=670, ), NodePool( name="n1-highmem-32-1xk80-preemptible", - id="n1_highmem_32", machine_type="n1-highmem-32", min_size=0, max_size=1, @@ -651,13 +663,13 @@ def google_cloud_provider(self) -> GoogleCloudProvider: memory=208 * 1024, available_memory=201 * 1024, disk_size=700, + available_disk_size=670, nvidia_gpu=1, nvidia_gpu_model="nvidia-tesla-k80", is_preemptible=True, ), ], storage=GoogleStorage( - id="premium", description="GCP Filestore (Premium)", tier=GoogleFilestoreTier.PREMIUM, instances=[ @@ -689,7 +701,6 @@ def aws_cloud_provider_response(self) -> dict[str, Any]: }, "node_pools": [ { - "id": "m5_2xlarge_8", "role": "platform_job", "name": "m5-2xlarge", "machine_type": "m5.2xlarge", @@ 
-700,9 +711,9 @@ def aws_cloud_provider_response(self) -> dict[str, Any]: "memory": 32 * 1024, "available_memory": 28 * 1024, "disk_size": 700, + "available_disk_size": 670, }, { - "id": "p2_xlarge_4", "role": "platform_job", "name": "p2-xlarge-1xk80-preemptible", "machine_type": "p2.xlarge", @@ -714,13 +725,13 @@ def aws_cloud_provider_response(self) -> dict[str, Any]: "memory": 61 * 1024, "available_memory": 57 * 1024, "disk_size": 700, + "available_disk_size": 670, "nvidia_gpu": 1, "nvidia_gpu_model": "nvidia-tesla-k80", "is_preemptible": True, }, ], "storage": { - "id": "generalpurpose_bursting", "description": "AWS EFS (generalPurpose, bursting)", "performance_mode": "generalPurpose", "throughput_mode": "bursting", @@ -743,7 +754,6 @@ def aws_cloud_provider(self) -> AWSCloudProvider: node_pools=[ NodePool( name="m5-2xlarge", - id="m5_2xlarge_8", machine_type="m5.2xlarge", min_size=0, max_size=1, @@ -752,10 +762,10 @@ def aws_cloud_provider(self) -> AWSCloudProvider: memory=32 * 1024, available_memory=28 * 1024, disk_size=700, + available_disk_size=670, ), NodePool( name="p2-xlarge-1xk80-preemptible", - id="p2_xlarge_4", machine_type="p2.xlarge", min_size=0, max_size=1, @@ -765,13 +775,13 @@ def aws_cloud_provider(self) -> AWSCloudProvider: memory=61 * 1024, available_memory=57 * 1024, disk_size=700, + available_disk_size=670, nvidia_gpu=1, nvidia_gpu_model="nvidia-tesla-k80", is_preemptible=True, ), ], storage=AWSStorage( - id="generalpurpose_bursting", description="AWS EFS (generalPurpose, bursting)", performance_mode=EFSPerformanceMode.GENERAL_PURPOSE, throughput_mode=EFSThroughputMode.BURSTING, @@ -805,7 +815,6 @@ def azure_cloud_provider_response(self) -> dict[str, Any]: }, "node_pools": [ { - "id": "standard_d8s_v3_8", "role": "platform_job", "name": "Standard_D8s_v3", "machine_type": "Standard_D8s_v3", @@ -816,9 +825,9 @@ def azure_cloud_provider_response(self) -> dict[str, Any]: "memory": 32 * 1024, "available_memory": 28 * 1024, "disk_size": 700, + "available_disk_size": 670, }, { - "id": "standard_nc6_6", "role": "platform_job", "name": "Standard_NC6-1xk80-preemptible", "machine_type": "Standard_NC6", @@ -830,13 +839,13 @@ def azure_cloud_provider_response(self) -> dict[str, Any]: "memory": 56 * 1024, "available_memory": 50 * 1024, "disk_size": 700, + "available_disk_size": 670, "nvidia_gpu": 1, "nvidia_gpu_model": "nvidia-tesla-k80", "is_preemptible": True, }, ], "storage": { - "id": "premium_lrs", "description": "Azure Files (Premium, LRS replication)", "tier": "Premium", "replication_type": "LRS", @@ -861,7 +870,6 @@ def azure_cloud_provider(self) -> AzureCloudProvider: node_pools=[ NodePool( name="Standard_D8s_v3", - id="standard_d8s_v3_8", machine_type="Standard_D8s_v3", min_size=0, max_size=1, @@ -870,10 +878,10 @@ def azure_cloud_provider(self) -> AzureCloudProvider: memory=32 * 1024, available_memory=28 * 1024, disk_size=700, + available_disk_size=670, ), NodePool( name="Standard_NC6-1xk80-preemptible", - id="standard_nc6_6", machine_type="Standard_NC6", min_size=0, max_size=1, @@ -883,13 +891,13 @@ def azure_cloud_provider(self) -> AzureCloudProvider: memory=56 * 1024, available_memory=50 * 1024, disk_size=700, + available_disk_size=670, nvidia_gpu=1, nvidia_gpu_model="nvidia-tesla-k80", is_preemptible=True, ), ], storage=AzureStorage( - id="premium_lrs", description="Azure Files (Premium, LRS replication)", tier=AzureStorageTier.PREMIUM, replication_type=AzureReplicationType.LRS, @@ -969,6 +977,7 @@ def on_prem_cloud_provider(self) -> OnPremCloudProvider: 
memory=1024, available_memory=1024, disk_size=700, + available_disk_size=700, machine_type="cpu-machine", ), NodePool( @@ -980,6 +989,7 @@ def on_prem_cloud_provider(self) -> OnPremCloudProvider: memory=1024, available_memory=1024, disk_size=700, + available_disk_size=700, nvidia_gpu=1, nvidia_gpu_model="nvidia-tesla-k80", price=Decimal("0.9"), @@ -1019,7 +1029,6 @@ def vcd_cloud_provider_response(self) -> dict[str, Any]: }, "node_pools": [ { - "id": "master_neuro_8", "role": "platform_job", "min_size": 1, "max_size": 1, @@ -1030,9 +1039,9 @@ def vcd_cloud_provider_response(self) -> dict[str, Any]: "memory": 32 * 1024, "available_memory": 29 * 1024, "disk_size": 700, + "available_disk_size": 670, }, { - "id": "x16_neuro_16", "role": "platform_job", "min_size": 1, "max_size": 1, @@ -1043,6 +1052,7 @@ def vcd_cloud_provider_response(self) -> dict[str, Any]: "memory": 40 * 1024, "available_memory": 37 * 1024, "disk_size": 700, + "available_disk_size": 670, "nvidia_gpu": 1, "nvidia_gpu_model": "nvidia-tesla-k80", "price": "0.9", @@ -1081,25 +1091,25 @@ def vcd_cloud_provider(self) -> VCDCloudProvider: min_size=1, max_size=1, name="Master-neuro", - id="master_neuro_8", machine_type="Master-neuro", cpu=8.0, available_cpu=7.0, memory=32 * 1024, available_memory=29 * 1024, disk_size=700, + available_disk_size=670, ), NodePool( min_size=1, max_size=1, name="X16-neuro-1xk80", - id="x16_neuro_16", machine_type="X16-neuro", cpu=16.0, available_cpu=15.0, memory=40 * 1024, available_memory=37 * 1024, disk_size=700, + available_disk_size=670, nvidia_gpu=1, nvidia_gpu_model="nvidia-tesla-k80", price=Decimal("0.9"), @@ -1267,7 +1277,6 @@ def test_create_minimal_credentials( @pytest.fixture def node_pool_options_response(self) -> dict[str, Any]: return { - "id": "standard_nd24s_24", "machine_type": "Standard_ND24s", "cpu": 24, "available_cpu": 23, @@ -1281,14 +1290,13 @@ def node_pool_options_response(self) -> dict[str, Any]: @pytest.fixture def node_pool_options(self) -> NodePoolOptions: return NodePoolOptions( - id="standard_nd24s_24", machine_type="Standard_ND24s", cpu=24, available_cpu=23, memory=458752, available_memory=452608, - gpu=4, - gpu_model="nvidia-tesla-p40", + nvidia_gpu=4, + nvidia_gpu_model="nvidia-tesla-p40", ) def test_aws_cloud_provider_options( @@ -1299,34 +1307,18 @@ def test_aws_cloud_provider_options( ) -> None: response = { "node_pools": [node_pool_options_response], - "storages": [ - { - "id": "generalpurpose_bursting", - "performance_mode": "generalPurpose", - "throughput_mode": "bursting", - } - ], } result = factory.create_cloud_provider_options(CloudProviderType.AWS, response) assert result == CloudProviderOptions( type=CloudProviderType.AWS, node_pools=[node_pool_options], - storages=[ - AWSStorageOptions( - id="generalpurpose_bursting", - performance_mode=EFSPerformanceMode.GENERAL_PURPOSE, - throughput_mode=EFSThroughputMode.BURSTING, - ) - ], ) def test_aws_cloud_provider_options_defaults(self, factory: EntityFactory) -> None: result = factory.create_cloud_provider_options(CloudProviderType.AWS, {}) - assert result == CloudProviderOptions( - type=CloudProviderType.AWS, node_pools=[], storages=[] - ) + assert result == CloudProviderOptions(type=CloudProviderType.AWS, node_pools=[]) def test_google_cloud_provider_options( self, @@ -1336,28 +1328,11 @@ def test_google_cloud_provider_options( ) -> None: response = { "node_pools": [node_pool_options_response], - "storages": [ - { - "id": "standard", - "tier": "STANDARD", - "min_capacity": 1099511627776, - "max_capacity": 
70258793014886, - } - ], } result = factory.create_cloud_provider_options(CloudProviderType.GCP, response) assert result == CloudProviderOptions( - type=CloudProviderType.GCP, - node_pools=[node_pool_options], - storages=[ - GoogleStorageOptions( - id="standard", - tier=GoogleFilestoreTier.STANDARD, - min_capacity=1099511627776, - max_capacity=70258793014886, - ) - ], + type=CloudProviderType.GCP, node_pools=[node_pool_options] ) def test_azure_cloud_provider_options( @@ -1368,32 +1343,13 @@ def test_azure_cloud_provider_options( ) -> None: response = { "node_pools": [node_pool_options_response], - "storages": [ - { - "id": "standard_lrs", - "tier": "Standard", - "replication_type": "LRS", - "min_file_share_size": 1073741824, - "max_file_share_size": 5497558138880, - } - ], } result = factory.create_cloud_provider_options( CloudProviderType.AZURE, response ) assert result == CloudProviderOptions( - type=CloudProviderType.AZURE, - node_pools=[node_pool_options], - storages=[ - AzureStorageOptions( - id="standard_lrs", - tier=AzureStorageTier.STANDARD, - replication_type=AzureReplicationType.LRS, - min_file_share_size=1073741824, - max_file_share_size=5497558138880, - ) - ], + type=CloudProviderType.AZURE, node_pools=[node_pool_options] ) def test_vcd_cloud_provider_options_defaults( @@ -1414,7 +1370,6 @@ def test_vcd_cloud_provider_options_defaults( assert result == VCDCloudProviderOptions( type=CloudProviderType.VCD_MTS, node_pools=[node_pool_options], - storages=[], kubernetes_node_pool_id="master_neuro_2", platform_node_pool_id="master_neuro_2", ) @@ -1443,7 +1398,6 @@ def test_vcd_cloud_provider_options( assert result == VCDCloudProviderOptions( type=CloudProviderType.VCD_MTS, node_pools=[node_pool_options], - storages=[], kubernetes_node_pool_id="master_neuro_2", platform_node_pool_id="master_neuro_2", url=URL("https://vcd"), @@ -1501,6 +1455,57 @@ class TestPayloadFactory: def factory(self) -> PayloadFactory: return PayloadFactory() + def test_create_patch_cluster_request( + self, factory: PayloadFactory, credentials: CredentialsConfig + ) -> None: + result = factory.create_patch_cluster_request( + PatchClusterRequest( + credentials=credentials, + storage=StorageConfig(url=URL("https://storage-dev.neu.ro")), + registry=RegistryConfig(url=URL("https://registry-dev.neu.ro")), + orchestrator=PatchOrchestratorConfigRequest(), + monitoring=MonitoringConfig(url=URL("https://monitoring-dev.neu.ro")), + secrets=SecretsConfig(url=URL("https://secrets-dev.neu.ro")), + metrics=MetricsConfig(url=URL("https://metrics-dev.neu.ro")), + disks=DisksConfig( + url=URL("https://metrics-dev.neu.ro"), storage_limit_per_user=1024 + ), + buckets=BucketsConfig( + url=URL("https://buckets-dev.neu.ro"), disable_creation=True + ), + ingress=IngressConfig(acme_environment=ACMEEnvironment.PRODUCTION), + dns=DNSConfig( + name="neu.ro", + a_records=[ARecord(name="*.jobs-dev.neu.ro.", ips=["192.168.0.2"])], + ), + timezone=ZoneInfo("America/Los_Angeles"), + energy=EnergyConfig(co2_grams_eq_per_kwh=100), + ) + ) + + assert result == { + "credentials": mock.ANY, + "storage": mock.ANY, + "registry": mock.ANY, + "orchestrator": mock.ANY, + "monitoring": mock.ANY, + "secrets": mock.ANY, + "metrics": mock.ANY, + "disks": mock.ANY, + "buckets": mock.ANY, + "ingress": mock.ANY, + "dns": mock.ANY, + "timezone": "America/Los_Angeles", + "energy": mock.ANY, + } + + def test_create_patch_cluster_request_default( + self, factory: PayloadFactory + ) -> None: + result = factory.create_patch_cluster_request(PatchClusterRequest()) + 
+ assert result == {} + def test_create_orchestrator(self, factory: PayloadFactory) -> None: result = factory.create_orchestrator( OrchestratorConfig( @@ -1527,7 +1532,13 @@ def test_create_orchestrator(self, factory: PayloadFactory) -> None: name="idle", count=1, image="miner", - resources=Resources(cpu_m=1000, memory=1024), + resources=Resources( + cpu=1, + memory=1024, + nvidia_gpu=1, + amd_gpu=2, + intel_gpu=3, + ), ), IdleJobConfig( name="idle", @@ -1535,7 +1546,7 @@ def test_create_orchestrator(self, factory: PayloadFactory) -> None: image="miner", command=["bash"], args=["-c", "sleep infinity"], - resources=Resources(cpu_m=1000, memory=1024), + resources=Resources(cpu=1, memory=1024), env={"NAME": "VALUE"}, node_selector={"label": "value"}, image_pull_secret="secret", @@ -1561,7 +1572,13 @@ def test_create_orchestrator(self, factory: PayloadFactory) -> None: "name": "idle", "count": 1, "image": "miner", - "resources": {"cpu_m": 1000, "memory": 1024}, + "resources": { + "cpu": 1, + "memory": 1024, + "nvidia_gpu": 1, + "amd_gpu": 2, + "intel_gpu": 3, + }, }, { "name": "idle", @@ -1569,7 +1586,7 @@ def test_create_orchestrator(self, factory: PayloadFactory) -> None: "image": "miner", "command": ["bash"], "args": ["-c", "sleep infinity"], - "resources": {"cpu_m": 1000, "memory": 1024}, + "resources": {"cpu": 1, "memory": 1024}, "env": {"NAME": "VALUE"}, "node_selector": {"label": "value"}, "image_pull_secret": "secret", @@ -1597,6 +1614,69 @@ def test_create_orchestrator_default(self, factory: PayloadFactory) -> None: "is_http_ingress_secure": False, } + def test_create_patch_orchestrator_request(self, factory: PayloadFactory) -> None: + result = factory.create_patch_orchestrator_request( + PatchOrchestratorConfigRequest( + job_hostname_template="{job_id}.jobs-dev.neu.ro", + job_internal_hostname_template="{job_id}.platform-jobs", + job_fallback_hostname="default.jobs-dev.neu.ro", + job_schedule_timeout_s=1, + job_schedule_scale_up_timeout_s=2, + is_http_ingress_secure=False, + allow_privileged_mode=True, + allow_job_priority=True, + resource_pool_types=[ResourcePoolType(name="cpu")], + resource_presets=[ + ResourcePreset( + name="cpu-micro", + credits_per_hour=Decimal(10), + cpu=0.1, + memory=100, + ) + ], + pre_pull_images=["neuromation/base"], + idle_jobs=[ + IdleJobConfig( + name="idle", + count=1, + image="miner", + resources=Resources(cpu=1, memory=1024), + ) + ], + ) + ) + + assert result == { + "job_hostname_template": "{job_id}.jobs-dev.neu.ro", + "job_internal_hostname_template": "{job_id}.platform-jobs", + "job_fallback_hostname": "default.jobs-dev.neu.ro", + "job_schedule_timeout_s": 1, + "job_schedule_scale_up_timeout_s": 2, + "is_http_ingress_secure": False, + "resource_pool_types": [mock.ANY], + "resource_presets": [mock.ANY], + "allow_privileged_mode": True, + "allow_job_priority": True, + "pre_pull_images": ["neuromation/base"], + "idle_jobs": [ + { + "name": "idle", + "count": 1, + "image": "miner", + "resources": {"cpu": 1, "memory": 1024}, + } + ], + } + + def test_create_patch_orchestrator_request_default( + self, factory: PayloadFactory + ) -> None: + result = factory.create_patch_orchestrator_request( + PatchOrchestratorConfigRequest() + ) + + assert result == {} + def test_create_resource_pool_type( self, factory: PayloadFactory, @@ -1615,6 +1695,7 @@ def test_create_resource_pool_type( memory=12 * 1024, available_memory=10 * 1024, disk_size=700, + available_disk_size=670, nvidia_gpu=1, amd_gpu=2, intel_gpu=3, @@ -1644,6 +1725,7 @@ def 
test_create_resource_pool_type( "memory": 12 * 1024, "available_memory": 10 * 1024, "disk_size": 700, + "available_disk_size": 670, "nvidia_gpu": 1, "amd_gpu": 2, "intel_gpu": 3, @@ -1667,15 +1749,16 @@ def test_create_empty_resource_pool_type(self, factory: PayloadFactory) -> None: assert result == { "name": "node-pool", + "cpu": 1.0, "available_cpu": 1.0, + "memory": 2**30, "available_memory": 2**30, - "cpu": 1.0, "disk_size": 150 * 2**30, + "available_disk_size": 150 * 2**30, "idle_size": 0, "is_preemptible": False, - "max_size": 1, - "memory": 2**30, "min_size": 0, + "max_size": 1, } def test_create_tpu_resource(self, factory: PayloadFactory) -> None: @@ -1996,7 +2079,6 @@ def test_create_minimal_credentials( def node_pool(self) -> NodePool: return NodePool( name="my-node-pool", - id="someid", min_size=0, max_size=10, idle_size=1, @@ -2006,6 +2088,7 @@ def node_pool(self) -> NodePool: memory=2048, available_memory=1024, disk_size=100500, + available_disk_size=100000, disk_type="some-disk-type", nvidia_gpu=1, nvidia_gpu_model="some-gpu-model", @@ -2017,10 +2100,32 @@ def node_pool(self) -> NodePool: cpu_max_watts=1000, ) - def test_node_pool(self, factory: PayloadFactory, node_pool: NodePool) -> None: - payload = factory.create_node_pool(node_pool) + def test_create_add_node_pool_request(self, factory: PayloadFactory) -> None: + node_pool = AddNodePoolRequest( + name="my-node-pool", + min_size=0, + max_size=10, + idle_size=1, + machine_type="some-machine-type", + cpu=10, + available_cpu=9, + memory=2048, + available_memory=1024, + disk_size=100500, + available_disk_size=100000, + disk_type="some-disk-type", + nvidia_gpu=1, + nvidia_gpu_model="some-gpu-model", + price=Decimal(180), + currency="rabbits", + is_preemptible=True, + zones=("here", "there"), + cpu_min_watts=0.01, + cpu_max_watts=1000, + ) + payload = factory.create_add_node_pool_request(node_pool) + assert payload == { - "id": "someid", "name": "my-node-pool", "role": "platform_job", "min_size": 0, @@ -2033,6 +2138,7 @@ def test_node_pool(self, factory: PayloadFactory, node_pool: NodePool) -> None: "memory": 2048, "available_memory": 1024, "disk_size": 100500, + "available_disk_size": 100000, "disk_type": "some-disk-type", "nvidia_gpu": 1, "nvidia_gpu_model": "some-gpu-model", @@ -2043,33 +2149,100 @@ def test_node_pool(self, factory: PayloadFactory, node_pool: NodePool) -> None: "cpu_max_watts": 1000, } - np = replace( - node_pool, - cpu=None, - available_cpu=None, - memory=None, - available_memory=None, - zones=None, - ) + def test_create_add_node_pool_request_default( + self, factory: PayloadFactory + ) -> None: + node_pool = AddNodePoolRequest(name="my-node-pool", min_size=0, max_size=1) + + payload = factory.create_add_node_pool_request(node_pool) - payload = factory.create_node_pool(np) assert payload == { - "id": "someid", "name": "my-node-pool", "role": "platform_job", + "min_size": 0, + "max_size": 1, + } + + def test_create_patch_node_pool_size_request(self, factory: PayloadFactory) -> None: + payload = factory.create_patch_node_pool_request( + PatchNodePoolSizeRequest(min_size=1, max_size=3, idle_size=2) + ) + + assert payload == { + "min_size": 1, + "max_size": 3, + "idle_size": 2, + } + + def test_create_patch_node_pool_size_request_default( + self, factory: PayloadFactory + ) -> None: + payload = factory.create_patch_node_pool_request(PatchNodePoolSizeRequest()) + + assert payload == {} + + def test_create_patch_node_pool_resources_request( + self, factory: PayloadFactory + ) -> None: + payload = 
factory.create_patch_node_pool_request( + PatchNodePoolResourcesRequest( + min_size=0, + max_size=10, + machine_type="n1-highmem-8", + cpu=1, + available_cpu=0.9, + memory=1024, + available_memory=1023, + disk_size=100, + available_disk_size=75, + nvidia_gpu=1, + nvidia_gpu_model="nvidia-gpu", + amd_gpu=1, + amd_gpu_model="amd-gpu", + intel_gpu=1, + intel_gpu_model="intel-gpu", + ) + ) + + assert payload == { "min_size": 0, "max_size": 10, - "idle_size": 1, - "is_preemptible": True, - "machine_type": "some-machine-type", - "disk_size": 100500, - "disk_type": "some-disk-type", + "machine_type": "n1-highmem-8", + "cpu": 1, + "available_cpu": 0.9, + "memory": 1024, + "available_memory": 1023, + "disk_size": 100, + "available_disk_size": 75, "nvidia_gpu": 1, - "nvidia_gpu_model": "some-gpu-model", - "price": "180", - "currency": "rabbits", - "cpu_min_watts": 0.01, - "cpu_max_watts": 1000, + "nvidia_gpu_model": "nvidia-gpu", + "amd_gpu": 1, + "amd_gpu_model": "amd-gpu", + "intel_gpu": 1, + "intel_gpu_model": "intel-gpu", + } + + def test_create_patch_node_pool_resources_request_default( + self, factory: PayloadFactory + ) -> None: + payload = factory.create_patch_node_pool_request( + PatchNodePoolResourcesRequest( + cpu=1, + available_cpu=0.9, + memory=1024, + available_memory=1023, + disk_size=100, + available_disk_size=75, + ) + ) + + assert payload == { + "cpu": 1, + "available_cpu": 0.9, + "memory": 1024, + "available_memory": 1023, + "disk_size": 100, + "available_disk_size": 75, } def test_create_energy(self, factory: PayloadFactory) -> None:
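
Usage sketch: this patch replaces the keyword-argument forms of patch_cluster / patch_node_pool with dedicated request dataclasses (PatchClusterRequest, PatchNodePoolSizeRequest, PatchNodePoolResourcesRequest, PatchOrchestratorConfigRequest). The snippet below is an illustrative sketch only, not part of the patch: it assumes a ConfigClient instance constructed elsewhere, and the cluster and node pool names are placeholders.

from neuro_config_client import (
    ConfigClient,
    PatchClusterRequest,
    PatchNodePoolSizeRequest,
    PatchOrchestratorConfigRequest,
)


async def apply_patches(client: ConfigClient) -> None:
    # Resize an existing node pool; only the fields that are set end up in the
    # PATCH payload (see PayloadFactory.create_patch_node_pool_request above).
    await client.patch_node_pool(
        "my-cluster",
        "n1-highmem-8",
        PatchNodePoolSizeRequest(min_size=1, max_size=5, idle_size=1),
    )

    # Patch a single section of the cluster config; unset sections are omitted
    # from the request body entirely.
    await client.patch_cluster(
        "my-cluster",
        PatchClusterRequest(
            orchestrator=PatchOrchestratorConfigRequest(job_schedule_timeout_s=30),
        ),
    )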