diff --git a/examples/serve/min_replicas_zero.yaml b/examples/serve/min_replicas_zero.yaml new file mode 100644 index 00000000000..e6e54cd41f2 --- /dev/null +++ b/examples/serve/min_replicas_zero.yaml @@ -0,0 +1,27 @@ +# SkyServe YAML to test min_replicas=0 with a simple http server. +# The service will be initialized with no replicas (min_replicas = 0). +# Any traffic to the service will trigger an immediate scale-up. +# The service will be scaled down to 0 replicas when there is no traffic +# for a long time. +# +# Usage: +# sky serve up -n min_replicas examples/serve/min_replicas_zero.yaml +# The endpoint will be printed in the console. +# Querying the endpoint will trigger a scale up. + +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replica_policy: + min_replicas: 0 + max_replicas: 2 + target_qps_per_replica: 1 + +resources: + ports: 8081 + cpus: 2+ + +workdir: examples/serve/http_server + +run: python3 server.py diff --git a/sky/cli.py b/sky/cli.py index e457269e0b4..a66dffbee64 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4386,6 +4386,9 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): down. This usually indicates resource leakages. If you see such status, please login to the cloud console and double-check + - ``NO_REPLICA``: The service has no replicas. This usually happens when + min_replicas is set to 0 and there is no traffic to the system. 
+ Each replica can have one of the following statuses: - ``PENDING``: The maximum number of simultaneous launches has been reached diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 29f58a21370..fd5dc4b6148 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -154,7 +154,7 @@ def _get_desired_num_replicas(self) -> int: logger.info(f'Requests per second: {num_requests_per_second}, ' f'Current target number of replicas: {target_num_replicas}') - if not self.bootstrap_done: + if not self.bootstrap_done or self.target_num_replicas == 0: self.bootstrap_done = True return target_num_replicas elif target_num_replicas > self.target_num_replicas: @@ -173,6 +173,14 @@ def _get_desired_num_replicas(self) -> int: self.upscale_counter = self.downscale_counter = 0 return self.target_num_replicas + def get_decision_interval(self) -> int: + # Reduce autoscaler interval when target_num_replicas = 0. + # This will happen when min_replicas = 0 and no traffic. + if self.target_num_replicas == 0: + return constants.AUTOSCALER_NO_REPLICA_DECISION_INTERVAL_SECONDS + else: + return constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS + def evaluate_scaling( self, replica_infos: List['replica_managers.ReplicaInfo'], diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 73e2e21c2ca..c292a59e88b 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -36,6 +36,8 @@ # Autoscaler scale decision interval in seconds. # We will try to scale up/down every `decision_interval`. AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS = 20 +# Autoscaler no replica decision interval in seconds. +AUTOSCALER_NO_REPLICA_DECISION_INTERVAL_SECONDS = 5 # Autoscaler default upscale delays in seconds. # We will upscale only if the target number of instances # is larger than the current launched instances for delay amount of time. 
diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 8497da07afc..36e5e97c74a 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -88,7 +88,7 @@ def _run_autoscaler(self): f'{common_utils.format_exception(e)}') with ux_utils.enable_traceback(): logger.error(f' Traceback: {traceback.format_exc()}') - time.sleep(constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS) + time.sleep(self._autoscaler.get_decision_interval()) def run(self) -> None: diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index b09dd063c90..10c6a505267 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -153,6 +153,9 @@ class ServiceStatus(enum.Enum): # Clean up failed FAILED_CLEANUP = 'FAILED_CLEANUP' + # No replica + NO_REPLICA = 'NO_REPLICA' + @classmethod def failed_statuses(cls) -> List['ServiceStatus']: return [cls.CONTROLLER_FAILED, cls.FAILED_CLEANUP] @@ -175,6 +178,9 @@ def from_replica_statuses( if sum(status2num[status] for status in ReplicaStatus.failed_statuses()) > 0: return cls.FAILED + # When min_replicas = 0, there is no (provisioning) replica. 
+ if len(replica_statuses) == 0: + return cls.NO_REPLICA return cls.REPLICA_INIT @@ -186,6 +192,7 @@ def from_replica_statuses( ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, ServiceStatus.FAILED_CLEANUP: colorama.Fore.RED, + ServiceStatus.NO_REPLICA: colorama.Fore.MAGENTA, } diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index d73477c4b95..06a7fe8d540 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -33,9 +33,9 @@ def __init__( qps_upper_threshold: Optional[float] = None, qps_lower_threshold: Optional[float] = None, ) -> None: - if min_replicas <= 0: + if min_replicas < 0: with ux_utils.print_exception_no_traceback(): - raise ValueError('min_replicas must be greater than 0') + raise ValueError('min_replicas must be greater or equal to 0') if max_replicas is not None and max_replicas < min_replicas: with ux_utils.print_exception_no_traceback(): raise ValueError(