From 9a645a4e3c5731ee21bfec7d1467a5a20634abc7 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Thu, 13 Jul 2023 22:51:22 +0000 Subject: [PATCH 001/223] init --- sky/serve/autoscalers.py | 147 +++++++++++++++++++++ sky/serve/common.py | 31 +++++ sky/serve/controller.py | 116 +++++++++++++++++ sky/serve/examples/api_server.yaml | 56 ++++++++ sky/serve/examples/tgi_coder.yaml | 16 +++ sky/serve/infra_providers.py | 203 +++++++++++++++++++++++++++++ sky/serve/load_balancers.py | 160 +++++++++++++++++++++++ sky/serve/redirector.py | 114 ++++++++++++++++ sky/task.py | 3 + sky/utils/schemas.py | 3 + 10 files changed, 849 insertions(+) create mode 100644 sky/serve/autoscalers.py create mode 100644 sky/serve/common.py create mode 100644 sky/serve/controller.py create mode 100644 sky/serve/examples/api_server.yaml create mode 100644 sky/serve/examples/tgi_coder.yaml create mode 100644 sky/serve/infra_providers.py create mode 100644 sky/serve/load_balancers.py create mode 100644 sky/serve/redirector.py diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py new file mode 100644 index 00000000000..a9075beedfb --- /dev/null +++ b/sky/serve/autoscalers.py @@ -0,0 +1,147 @@ +import logging +import time + +from sky.serve.infra_providers import InfraProvider +from sky.serve.load_balancers import LoadBalancer + +logger = logging.getLogger(__name__) + + +class Autoscaler: + + def __init__(self, + infra_provider: InfraProvider, + load_balancer: LoadBalancer, + frequency: int = 60): + self.infra_provider = infra_provider + self.load_balancer = load_balancer + self.frequency = frequency # Time to sleep in seconds. 
+ + def evaluate_scaling(self): + raise NotImplementedError + + def scale_up(self, num_nodes_to_add: int): + logger.debug(f'Scaling up by {num_nodes_to_add} nodes') + self.infra_provider.scale_up(num_nodes_to_add) + + def scale_down(self, num_nodes_to_remove): + logger.debug(f'Scaling down by {num_nodes_to_remove} nodes') + self.infra_provider.scale_down(num_nodes_to_remove) + + def monitor(self): + logger.info('Starting autoscaler monitor.') + while True: + self.evaluate_scaling() + time.sleep(self.frequency) + + +class LatencyThresholdAutoscaler(Autoscaler): + + def __init__(self, + *args, + upper_threshold: int = 50, + lower_threshold: int = 1, + min_nodes: int = 1, + **kwargs): + ''' + Autoscaler that scales up when the average latency of all servers is above the upper threshold and scales down + when the average latency of all servers is below the lower threshold. + :param args: + :param upper_threshold: upper threshold for latency in seconds + :param lower_threshold: lower threshold for latency in seconds + :param min_nodes: minimum number of nodes to keep running + :param kwargs: + ''' + super().__init__(*args, **kwargs) + self.upper_threshold = upper_threshold + self.lower_threshold = lower_threshold + self.min_nodes = min_nodes + + def evaluate_scaling(self): + server_loads = self.load_balancer.server_loads + if not server_loads: + return + + avg_latencies = [ + sum(latencies) / len(latencies) + for latencies in server_loads.values() + ] + + if all(latency > self.upper_threshold for latency in avg_latencies): + self.scale_up(1) + elif all(latency < self.lower_threshold for latency in avg_latencies): + if self.infra_provider.total_servers() > self.min_nodes: + self.scale_down(1) + + +class RequestRateAutoscaler(Autoscaler): + + def __init__(self, + *args, + query_interval: int = 10, + upper_threshold: int = 10, + lower_threshold: int = 2, + min_nodes: int = 1, + cooldown: int = 60, + **kwargs): + """ + Autoscaler that scales when the number of requests in 
the given interval is above or below the upper threshold + :param args: + :param query_interval: + :param upper_threshold: + :param lower_threshold: + :param min_nodes: + :param cooldown: Seconds to wait before scaling again. + :param kwargs: + """ + super().__init__(*args, **kwargs) + self.query_interval = query_interval + self.upper_threshold = upper_threshold + self.lower_threshold = lower_threshold + self.min_nodes = min_nodes + self.cooldown = cooldown + self.last_scale_operation = 0 # Time of last scale operation. + + def evaluate_scaling(self): + current_time = time.time() + + # Check if cooldown period has passed since the last scaling operation + if current_time - self.last_scale_operation < self.cooldown: + logger.info( + f'Current time: {current_time}, last scale operation: {self.last_scale_operation}, cooldown: {self.cooldown}' + ) + logger.info( + f'Cooldown period has not passed since last scaling operation. Skipping scaling.' + ) + return + + while (self.load_balancer.request_timestamps and + current_time - self.load_balancer.request_timestamps[0] > + self.query_interval): + self.load_balancer.request_timestamps.popleft() + + num_requests = len(self.load_balancer.request_timestamps) + num_nodes = self.infra_provider.total_servers() + requests_per_node = num_requests / num_nodes if num_nodes else num_requests # To account for zero case. 
+ + logger.info(f'Requests per node: {requests_per_node}') + logger.info( + f'Upper threshold: {self.upper_threshold} q/node, lower threshold: {self.lower_threshold} q/node, queries per node: {requests_per_node} q/node' + ) + + scaled = True + # Bootstrap case + logger.info(f'Number of nodes: {num_nodes}') + if num_nodes == 0 and requests_per_node > 0: + logger.info(f'Bootstrapping autoscaler.') + self.scale_up(1) + self.last_scale_operation = current_time + elif requests_per_node > self.upper_threshold: + self.scale_up(1) + self.last_scale_operation = current_time + elif requests_per_node < self.lower_threshold: + if self.infra_provider.total_servers() > self.min_nodes: + self.scale_down(1) + self.last_scale_operation = current_time + else: + logger.info(f'No scaling needed.') diff --git a/sky/serve/common.py b/sky/serve/common.py new file mode 100644 index 00000000000..c3526ac70f5 --- /dev/null +++ b/sky/serve/common.py @@ -0,0 +1,31 @@ +import yaml + +class SkyServiceSpec: + + def __init__(self, yaml_path: str): + with open(yaml_path, 'r') as f: + self.task = yaml.safe_load(f) + if 'service' not in self.task: + raise ValueError('Task YAML must have a "service" section') + if 'port' not in self.task['service']: + raise ValueError('Task YAML must have a "port" section') + if 'readiness_probe' not in self.task['service']: + raise ValueError('Task YAML must have a "readiness_probe" section') + self._readiness_path = self.get_readiness_path() + self._app_port = self.get_app_port() + + def get_readiness_path(self): + # TODO: check if the path is valid + return f':{self.task["service"]["port"]}{self.task["service"]["readiness_probe"]}' + + def get_app_port(self): + # TODO: check if the port is valid + return f'{self.task["service"]["port"]}' + + @property + def readiness_path(self): + return self._readiness_path + + @property + def app_port(self): + return self._app_port diff --git a/sky/serve/controller.py b/sky/serve/controller.py new file mode 100644 index 
00000000000..ed79f583a1c --- /dev/null +++ b/sky/serve/controller.py @@ -0,0 +1,116 @@ +import logging + +import argparse + +from sky.serve.autoscalers import RequestRateAutoscaler, Autoscaler +from sky.serve.common import SkyServiceSpec +from sky.serve.infra_providers import InfraProvider, SkyPilotInfraProvider +from sky.serve.load_balancers import RoundRobinLoadBalancer, LoadBalancer + +import time +import threading + +from fastapi import FastAPI, Request +import uvicorn + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s | %(levelname)-6s | %(name)-40s || %(message)s', + datefmt='%m-%d %H:%M:%S', + force=True) +logger = logging.getLogger(__name__) + + +class Controller: + + def __init__(self, + infra_provider: InfraProvider, + load_balancer: LoadBalancer, + autoscaler: Autoscaler = None, + port: int = 8082): + self.port = port + self.infra_provider = infra_provider + self.load_balancer = load_balancer + self.autoscaler = autoscaler + self.app = FastAPI() + + def server_fetcher(self): + while True: + logger.info('Running server fetcher.') + server_ips = self.infra_provider.get_server_ips() + self.load_balancer.probe_endpoints(server_ips) + time.sleep(10) + + def run(self): + + @self.app.post('/controller/increment_request_count') + async def increment_request_count(request: Request): + # await request + request_data = await request.json() + # get request data + count = 0 if 'counts' not in request_data else request_data['counts'] + logger.info(f'Received request: {request_data}') + self.load_balancer.increment_request_count(count=count) + return {'message': 'Success'} + + @self.app.get('/controller/get_server_ips') + def get_server_ips(): + return {'server_ips': list(self.load_balancer.servers_queue)} + + # Run server_monitor and autoscaler.monitor (if autoscaler is defined) in separate threads in the background. This should not block the main thread. 
+ server_fetcher_thread = threading.Thread(target=self.server_fetcher, + daemon=True) + server_fetcher_thread.start() + if self.autoscaler: + autoscaler_monitor_thread = threading.Thread( + target=self.autoscaler.monitor, daemon=True) + autoscaler_monitor_thread.start() + + logger.info(f'Sky Server started on http://0.0.0.0:{self.port}') + uvicorn.run(self.app, host='0.0.0.0', port=self.port) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SkyServe Server') + parser.add_argument('--task-yaml', type=str, help='Task YAML file', required=True) + parser.add_argument('--port', + '-p', + type=int, + help='Port to run the controller', + default=8082) + parser.add_argument('--min-nodes', + type=int, + default=1, + help='Minimum nodes to keep running') + args = parser.parse_args() + + # ======= Infra Provider ========= + # infra_provider = DummyInfraProvider() + infra_provider = SkyPilotInfraProvider(args.task_yaml) + + # ======= Load Balancer ========= + service_spec = SkyServiceSpec(args.task_yaml) + # Select the load balancing policy: RoundRobinLoadBalancer or LeastLoadedLoadBalancer + load_balancer = RoundRobinLoadBalancer( + infra_provider=infra_provider, + endpoint_path=service_spec.readiness_path) + # load_balancer = LeastLoadedLoadBalancer(n=5) + # autoscaler = LatencyThresholdAutoscaler(load_balancer, + # upper_threshold=0.5, # 500ms + # lower_threshold=0.1) # 100ms + + # ======= Autoscaler ========= + # Create an autoscaler with the RequestRateAutoscaler policy. Thresholds are defined as requests per node in the defined interval. + autoscaler = RequestRateAutoscaler(infra_provider, + load_balancer, + frequency=5, + query_interval=60, + lower_threshold=0, + upper_threshold=1, + min_nodes=args.min_nodes, + cooldown=60) + + # ======= Controller ========= + # Create a controller object and run it. 
+ controller = Controller(infra_provider, load_balancer, autoscaler, args.port) + controller.run() diff --git a/sky/serve/examples/api_server.yaml b/sky/serve/examples/api_server.yaml new file mode 100644 index 00000000000..6aabf130553 --- /dev/null +++ b/sky/serve/examples/api_server.yaml @@ -0,0 +1,56 @@ +resources: + accelerators: A100:1 + cloud: gcp + # region: us-central1 + # use_spot: True + image_id: projects/skypilot-375900/global/images/fastchat-serve-v0 + +num_nodes: 1 + +file_mounts: + ~/chatlogs: + name: skypilot-chatbot-logs + store: gcs + mode: MOUNT + +service: + port: 8081 + readiness_probe: /health/v1/models + +setup: | + conda activate chatbot + if [ $? -eq 0 ]; then + echo 'conda env exists' + else + # Setup the environment + conda create -n chatbot python=3.10 -y + conda activate chatbot + pip3 install fschat + fi + +run: | + conda activate chatbot + python3 -m fastchat.serve.controller --host 0.0.0.0 --port 21001 > ~/controller.log 2>&1 & + + WORKER_IP=$(hostname -I | cut -d' ' -f1) + CONTROLLER_PORT=21001 + WORKER_PORT=21002 + # python3 -m fastchat.serve.model_worker \ + # --model-path lmsys/vicuna-7b-v1.3 \ + # --controller-address http://${WORKER_IP}:${CONTROLLER_PORT} \ + # --worker-address http://${WORKER_IP}:${WORKER_PORT} \ + # --host 0.0.0.0 \ + # --port ${WORKER_PORT} > ~/worker.log 2>&1 & + + cd FastChat + python3 -m fastchat.serve.vllm_worker \ + --model-path lmsys/vicuna-7b-v1.3 \ + --controller-address http://${WORKER_IP}:${CONTROLLER_PORT} \ + --worker-address http://${WORKER_IP}:${WORKER_PORT} \ + --host 0.0.0.0 \ + --port ${WORKER_PORT} \ + --tokenizer hf-internal-testing/llama-tokenizer > ~/worker.log 2>&1 & + + + HOST_IP=$(hostname -I | cut -d' ' -f1) + python3 -m fastchat.serve.openai_api_server --host ${HOST_IP} --port 8081 diff --git a/sky/serve/examples/tgi_coder.yaml b/sky/serve/examples/tgi_coder.yaml new file mode 100644 index 00000000000..cd55cb295ce --- /dev/null +++ b/sky/serve/examples/tgi_coder.yaml @@ -0,0 +1,16 
@@ +resources: + accelerators: A100:1 + cloud: gcp + image_id: projects/skypilot-375900/global/images/coder + # use_spot: True + +num_nodes: 1 + +service: + port: 8082 + readiness_probe: /health + +run: | + volume=/home/gcpuser/sky_workdir/huggingface-vscode-endpoint-server/data/ + model=WizardLM/WizardCoder-15B-V1.0 + docker run --gpus all --shm-size 1g -p 8082:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference --model-id $model \ No newline at end of file diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py new file mode 100644 index 00000000000..0c7c2b42d83 --- /dev/null +++ b/sky/serve/infra_providers.py @@ -0,0 +1,203 @@ +import logging +from typing import List +import time + +import sky +from sky.backends import backend_utils + +import urllib +import threading + +logger = logging.getLogger(__name__) + + +class InfraProvider: + + def get_server_ips(self) -> List[str]: + raise NotImplementedError + + def total_servers(self) -> int: + # Returns the total number of servers, including those under provisioning and deletion + raise NotImplementedError + + def scale_up(self) -> None: + raise NotImplementedError + + def scale_down(self) -> None: + # TODO - Scale down must also pass in a list of servers to delete or the number of servers to delete + raise NotImplementedError + + def terminate_servers(self, unhealthy_servers: List[str]): + # Terminates the servers with endpoints in the list + raise NotImplementedError + + +class DummyInfraProvider(InfraProvider): + + def __init__(self): + self.DEFAULT_ENDPOINTS = [ + 'https://httpbin.org/get?id=basecase', 'https://www.google.com', + 'http://thiswebsitedoesntexistitsonlyfortesting.com' + ] + self.current_endpoints = self.DEFAULT_ENDPOINTS.copy() + + def get_server_ips(self) -> List[str]: + logger.info('Returning current endpoints: ' + + str(self.current_endpoints)) + return self.current_endpoints + + def total_servers(self) -> int: + return len(self.current_endpoints) + + def 
scale_up(self, n) -> None: + logger.info('DummyInfraProvider.scale_up called with n=' + str(n) + + '. Sleeping for 30s.') + for i in range(30): + logger.info('DummyInfraProvider.scale_up: ' + str(i) + '/30') + time.sleep(1) + # Add n new endpoints + for i in range(n): + self.current_endpoints.append('https://httpbin.org/get?id=' + + str(len(self.current_endpoints))) + logger.info('DummyInfraProvider.scale_up: done sleeping.') + + def scale_down(self, n) -> None: + logger.info('DummyInfraProvider.scale_down called with n=' + str(n) + + '. Doing nothing.') + + def terminate_servers(self, unhealthy_servers: List[str]): + # Remove unhealthy servers from current_endpoints + logger.info( + 'DummyInfraProvider.terminate_servers called with unhealthy_servers=' + + str(unhealthy_servers)) + self.current_endpoints = [ + endpoint for endpoint in self.current_endpoints + if endpoint not in unhealthy_servers + ] + + +class SkyPilotInfraProvider(InfraProvider): + CLUSTER_NAME_PREFIX = 'skyserve-' + + def __init__(self, task_yaml_path: str): + self.task_yaml_path = task_yaml_path + self.id_counter = self._get_id_start() + + def _get_id_start(self): + ''' + Returns the id to start from when creating a new cluster + ''' + clusters = sky.global_user_state.get_clusters() + # Filter out clusters that don't have the prefix + clusters = [ + cluster for cluster in clusters + if self.CLUSTER_NAME_PREFIX in cluster['name'] + ] + # Get the greatest id + max_id = 0 + for cluster in clusters: + name = cluster['name'] + id = int(name.split('-')[-1]) + if id > max_id: + max_id = id + return max_id + 1 + + def _get_ip_clusname_map(self): + """Returns a map of ip to cluster name for all clusters with the prefix""" + clusters = sky.global_user_state.get_clusters() + ip_clusname_map = {} + for cluster in clusters: + name = cluster['name'] + if self.CLUSTER_NAME_PREFIX in name: + handle = cluster['handle'] + try: + # Get the head node ip + ip = backend_utils.get_node_ips(handle.cluster_yaml, + 
handle.launched_nodes, + handle)[0] + ip_clusname_map[ip] = name + except sky.exceptions.FetchIPError: + logger.warning(f'Unable to get IP for cluster {name}.') + continue + return ip_clusname_map + + def _get_server_ips(self): + return list(self._get_ip_clusname_map().keys()) + + def _return_total_servers(self): + clusters = sky.global_user_state.get_clusters() + # Filter out clusters that don't have the prefix + # FIXME - this is a hack to get around. should implement a better filtering mechanism + clusters = [ + cluster for cluster in clusters + if self.CLUSTER_NAME_PREFIX in cluster['name'] + ] + return len(clusters) + + def _scale_up(self, n): + # Launch n new clusters + task = sky.Task.from_yaml(self.task_yaml_path) + for i in range(0, n): + cluster_name = f'{self.CLUSTER_NAME_PREFIX}{self.id_counter}' + logger.info(f'Creating SkyPilot cluster {cluster_name}') + sky.launch(task, cluster_name=cluster_name, + detach_run=True) # TODO - make the launch parallel + self.id_counter += 1 + + def _scale_down(self, n): + # Delete n clusters + # Currently deletes the first n clusters + clusters = sky.global_user_state.get_clusters() + # Filter out clusters that don't have the prefix + clusters = [ + cluster for cluster in clusters + if self.CLUSTER_NAME_PREFIX in cluster['name'] + ] + num_clusters = len(clusters) + if num_clusters > 0: + if n > num_clusters: + logger.warning( + f'Trying to delete {n} clusters, but only {num_clusters} clusters exist. Deleting all clusters.' 
+ ) + n = num_clusters + for i in range(0, n): + cluster = clusters[i] + logger.info(f'Deleting SkyPilot cluster {cluster["name"]}') + sky.down(cluster['name'], purge=True) + + def get_server_ips(self) -> List[str]: + ips = self._get_server_ips() + logger.info(f'Returning SkyPilot endpoints: {ips}') + return ips + + def total_servers(self) -> int: + return self._return_total_servers() + + def scale_up(self, n: int) -> None: + self._scale_up(n) + + def scale_down(self, n: int) -> None: + self._scale_down(n) + + def terminate_servers(self, unhealthy_servers: List[str]): + # Remove unhealthy servers from current_endpoints + logger.info( + 'SkyPilotInfraProvider.terminate_servers called with unhealthy_servers=' + + str(unhealthy_servers)) + for endpoint_url in unhealthy_servers: + ip_to_name_map = self._get_ip_clusname_map() + if endpoint_url not in ip_to_name_map: + logger.warning( + f'Unable to find cluster name for endpoint {endpoint_url}. Skipping.' + ) + continue + name = ip_to_name_map[endpoint_url] + if endpoint_url in unhealthy_servers: + logger.info(f'Deleting SkyPilot cluster {name}') + # Run sky.down in a daemon thread so that it doesn't block the main thread + threading.Thread(target=sky.down, + args=(name,), + kwargs={ + 'purge': True + }, + daemon=True).start() diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancers.py new file mode 100644 index 00000000000..4c0c95c6ff7 --- /dev/null +++ b/sky/serve/load_balancers.py @@ -0,0 +1,160 @@ +import time +from collections import deque + +import aiohttp +import logging + +from concurrent.futures import ThreadPoolExecutor, as_completed +import requests + +logger = logging.getLogger(__name__) + + +class LoadBalancer: + + def __init__(self, infra_provider, endpoint_path, post_data=None): + self.available_servers = [] + self.request_count = 0 + self.request_timestamps = deque() + self.infra_provider = infra_provider + self.endpoint_path = endpoint_path + self.post_data = post_data + + def 
increment_request_count(self, count=1): + self.request_count += count + self.request_timestamps.append(time.time()) + + def probe_endpoints(self, endpoint_ips): + raise NotImplementedError + + def select_server(self, request): + raise NotImplementedError + + +class RoundRobinLoadBalancer(LoadBalancer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.servers_queue = deque() + self.first_unhealthy_time = {} + self.timeout = 18000 + logger.info(f'Endpoint path: {self.endpoint_path}') + + def probe_endpoints(self, endpoint_ips): + + def probe_endpoint(endpoint_ip): + try: + if self.post_data: + response = requests.post( + f'http://{endpoint_ip}{self.endpoint_path}', + json=self.post_data, + timeout=3) + else: + response = requests.get( + f'http://{endpoint_ip}{self.endpoint_path}', timeout=3) + if response.status_code == 200: + logger.info(f'Server {endpoint_ip} is available.') + return endpoint_ip + except requests.exceptions.RequestException as e: + logger.info(e) + logger.info(f'Server {endpoint_ip} is not available.') + pass + return None + + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(probe_endpoint, endpoint_url) + for endpoint_url in endpoint_ips + ] + healthy_servers = [ + future.result() + for future in as_completed(futures) + if future.result() is not None + ] + logger.info(f'Healthy servers: {healthy_servers}') + # Add newly available servers + for server in healthy_servers: + if server not in self.available_servers: + logger.info( + f'Server {server} is newly available. Adding to available servers.' + ) + self.available_servers.append(server) + self.servers_queue.append(server) + # Remove servers that are no longer available + unhealthy_servers = set() + for server in self.available_servers: + if server not in healthy_servers: + logger.info( + f'Server {server} is no longer available. Removing from available servers.' 
+ ) + self.available_servers.remove(server) + self.servers_queue.remove(server) + unhealthy_servers.add(server) + # Tell the infra provider to remove endpoints that are no longer available + for server in endpoint_ips: + if server not in healthy_servers: + unhealthy_servers.add(server) + logger.info(f'Unhealthy servers: {unhealthy_servers}') + if unhealthy_servers: + servers_to_terminate = [] + for server in unhealthy_servers: + if server not in self.first_unhealthy_time: + self.first_unhealthy_time[server] = time.time() + elif time.time() - self.first_unhealthy_time[ + server] > self.timeout: # cooldown before terminating a dead server to avoid hysterisis + servers_to_terminate.append(server) + self.infra_provider.terminate_servers(servers_to_terminate) + + def select_server(self, request): + if not self.servers_queue: + return None + + server_ip = self.servers_queue.popleft() + self.servers_queue.append(server_ip) + logger.info(f'Selected server {server_ip} for request {request}') + return server_ip + + +class LeastLoadedLoadBalancer(LoadBalancer): + + def __init__(self, *args, n=10, **kwargs): + + super().__init__(*args, **kwargs) + self.server_loads = {} + self.n = n + + def probe_endpoints(self, endpoint_ips): + timeout = aiohttp.ClientTimeout(total=2) + with aiohttp.ClientSession(timeout=timeout) as session: + for server_ip in endpoint_ips: + try: + start_time = time() + with session.get(f'{server_ip}') as response: + if response.status == 200: + load = time() - start_time + + if server_ip not in self.server_loads: + self.server_loads[server_ip] = [load] * self.n + else: + self.server_loads[server_ip].append(load) + if len(self.server_loads[server_ip]) > self.n: + self.server_loads[server_ip].pop(0) + + if server_ip not in self.available_servers: + self.available_servers.append(server_ip) + elif server_ip in self.available_servers: + self.available_servers.remove(server_ip) + del self.server_loads[server_ip] + except: + if server_ip in self.available_servers: 
+ self.available_servers.remove(server_ip) + del self.server_loads[server_ip] + + def select_server(self, request): + if not self.server_loads: + return None + + server_ip = min( + self.server_loads, + key=lambda x: sum(self.server_loads[x]) / len(self.server_loads[x])) + return server_ip diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py new file mode 100644 index 00000000000..cca583c72e7 --- /dev/null +++ b/sky/serve/redirector.py @@ -0,0 +1,114 @@ +import time +import logging +from collections import deque + +from sky.serve.common import SkyServiceSpec + +from fastapi import FastAPI, Request, HTTPException +from fastapi.responses import RedirectResponse +import threading +import uvicorn + +import requests +import argparse + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s | %(levelname)-6s | %(name)-40s || %(message)s', + datefmt='%m-%d %H:%M:%S', + # force=True, +) +logger = logging.getLogger(__name__) + + +class SkyServeRedirector: + + def __init__(self, controller_url: str, service_spec: SkyServiceSpec, port: int = 8081): + self.controller_url = controller_url + self.port = port + self.app_port = service_spec.app_port + self.server_ips = [] + self.servers_queue = deque() + self.app = FastAPI() + self.request_count = 0 + self.controller_sync_timeout = 20 + + def sync_with_controller(self): + while True: + server_ips = [] + with requests.Session() as session: + try: + # send request count + response = session.post( + self.controller_url + + '/controller/increment_request_count', + json={'counts': self.request_count}, + timeout=5) + response.raise_for_status() + self.request_count = 0 + # get server ips + response = session.get(self.controller_url + + '/controller/get_server_ips') + response.raise_for_status() + server_ips = response.json()['server_ips'] + except requests.RequestException as e: + print(f'An error occurred: {e}') + else: + logger.info(f'Server IPs: {server_ips}') + self.servers_queue = deque(server_ips) + 
time.sleep(self.controller_sync_timeout) + + def select_server(self): + if not self.servers_queue: + return None + server_ip = self.servers_queue.popleft() + self.servers_queue.append(server_ip) + return server_ip + + async def redirector_handler(self, request: Request): + self.request_count += 1 + server_ip = self.select_server() + + if server_ip is None: + raise HTTPException(status_code=503, detail='No available servers') + logger.info(f'Redirecting request to {server_ip}{request.url.path}') + + path = f'http://{server_ip}:{self.app_port}{request.url.path}' + logger.info(f'Redirecting request to {path}') + return RedirectResponse(url=path) + + def serve(self): + self.app.add_api_route('/{path:path}', + self.redirector_handler, + methods=['GET', 'POST', 'PUT', 'DELETE']) + + server_fetcher_thread = threading.Thread( + target=self.sync_with_controller, daemon=True) + server_fetcher_thread.start() + + logger.info(f'Sky Server started on http://0.0.0.0:{self.port}') + logger.info('Sky Serve Redirector is ready to serve.') + + uvicorn.run(self.app, host='0.0.0.0', port=self.port) + + +if __name__ == '__main__': + # Add argparse + parser = argparse.ArgumentParser(description='SkyServe Redirector') + parser.add_argument('--task-yaml', type=str, help='Task YAML file', required=True) + parser.add_argument('--port', + '-p', + type=int, + help='Port to run the redirector on', + default=8081) + parser.add_argument('--controller-addr', + default='http://localhost:8082', + type=str, + help='Controller address (ip:port).') + args = parser.parse_args() + + service_spec = SkyServiceSpec(args.task_yaml) + redirector = SkyServeRedirector(controller_url=args.controller_addr, + service_spec=service_spec, + port=args.port) + redirector.serve() diff --git a/sky/task.py b/sky/task.py index 7e8fb37bff0..2105d0feeda 100644 --- a/sky/task.py +++ b/sky/task.py @@ -365,6 +365,9 @@ def from_yaml_config( resources = config.pop('resources', None) resources = 
sky.Resources.from_yaml_config(resources) + # FIXME: find a better way to exclude unused fields. + config.pop('service', None) + task.set_resources({resources}) assert not config, f'Invalid task args: {config.keys()}' return task diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index d5d13a0a1e8..caa808647e7 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -198,6 +198,9 @@ def get_task_schema(): 'type': 'number' } }, + 'service': { + 'type': 'object', + } } } From 4bb4420503c12582188bd0c9d55cde125bb36988 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Thu, 13 Jul 2023 22:54:16 +0000 Subject: [PATCH 002/223] format --- sky/serve/common.py | 3 ++- sky/serve/controller.py | 8 ++++++-- sky/serve/infra_providers.py | 4 ++-- sky/serve/redirector.py | 10 ++++++++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/sky/serve/common.py b/sky/serve/common.py index c3526ac70f5..5f211dc5a10 100644 --- a/sky/serve/common.py +++ b/sky/serve/common.py @@ -1,5 +1,6 @@ import yaml + class SkyServiceSpec: def __init__(self, yaml_path: str): @@ -25,7 +26,7 @@ def get_app_port(self): @property def readiness_path(self): return self._readiness_path - + @property def app_port(self): return self._app_port diff --git a/sky/serve/controller.py b/sky/serve/controller.py index ed79f583a1c..e28f7aedbc7 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -72,7 +72,10 @@ def get_server_ips(): if __name__ == '__main__': parser = argparse.ArgumentParser(description='SkyServe Server') - parser.add_argument('--task-yaml', type=str, help='Task YAML file', required=True) + parser.add_argument('--task-yaml', + type=str, + help='Task YAML file', + required=True) parser.add_argument('--port', '-p', type=int, @@ -112,5 +115,6 @@ def get_server_ips(): # ======= Controller ========= # Create a controller object and run it. 
- controller = Controller(infra_provider, load_balancer, autoscaler, args.port) + controller = Controller(infra_provider, load_balancer, autoscaler, + args.port) controller.run() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 0c7c2b42d83..0ebb3b32c58 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -20,10 +20,10 @@ def total_servers(self) -> int: # Returns the total number of servers, including those under provisioning and deletion raise NotImplementedError - def scale_up(self) -> None: + def scale_up(self, n: int) -> None: raise NotImplementedError - def scale_down(self) -> None: + def scale_down(self, n: int) -> None: # TODO - Scale down must also pass in a list of servers to delete or the number of servers to delete raise NotImplementedError diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py index cca583c72e7..5c3df12c62c 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/redirector.py @@ -23,7 +23,10 @@ class SkyServeRedirector: - def __init__(self, controller_url: str, service_spec: SkyServiceSpec, port: int = 8081): + def __init__(self, + controller_url: str, + service_spec: SkyServiceSpec, + port: int = 8081): self.controller_url = controller_url self.port = port self.app_port = service_spec.app_port @@ -95,7 +98,10 @@ def serve(self): if __name__ == '__main__': # Add argparse parser = argparse.ArgumentParser(description='SkyServe Redirector') - parser.add_argument('--task-yaml', type=str, help='Task YAML file', required=True) + parser.add_argument('--task-yaml', + type=str, + help='Task YAML file', + required=True) parser.add_argument('--port', '-p', type=int, From d52c2b97af78f710f97d2414aa9218bb19d81d48 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Thu, 13 Jul 2023 22:56:53 +0000 Subject: [PATCH 003/223] format --- sky/serve/controller.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 
e28f7aedbc7..dfe5a953764 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -10,6 +10,8 @@ import time import threading +from typing import Optional + from fastapi import FastAPI, Request import uvicorn @@ -26,7 +28,7 @@ class Controller: def __init__(self, infra_provider: InfraProvider, load_balancer: LoadBalancer, - autoscaler: Autoscaler = None, + autoscaler: Optional[Autoscaler] = None, port: int = 8082): self.port = port self.infra_provider = infra_provider From 208bdf4e43aefa815bea3098d0da49885e447190 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Fri, 14 Jul 2023 00:02:54 +0000 Subject: [PATCH 004/223] reademe --- sky/serve/README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 sky/serve/README.md diff --git a/sky/serve/README.md b/sky/serve/README.md new file mode 100644 index 00000000000..653bc88c75b --- /dev/null +++ b/sky/serve/README.md @@ -0,0 +1,25 @@ +# Sky Serve + +Serving library for SkyPilot. + +The goal of Sky Serve is simple - expose one endpoint, that redirects to serving endpoints running on different resources, regions and clouds. + +Sky Serve transparently handles load balancing, failover and autoscaling of the serving endpoints. + +## Architecture + +Sky Serve has four key components: +1. Server - The HTTP server is responsible for recieving requests and redirecting them to healthy endpoints. +2. Load balancers - spread requests across healthy endpoints according to different policies. +3. Autoscalers - scale up and down the number of serving endpoints according to different policies and handle recovery of unhealthy endpoints. +4. Infra Providers - provides a uniform interface to talk to SkyPilot. + +## Usage +** Work in progress** +```bash +# Run controller. +python -m sky.serve.controller --task-yaml examples/fastchat/api_server.yaml + +# Run redirector. 
+python -m sky.serve.redirector +``` \ No newline at end of file From 1fa64018d271f0fc80c4d687b58771dde91b338b Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Fri, 14 Jul 2023 00:03:36 +0000 Subject: [PATCH 005/223] update --- sky/serve/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/serve/README.md b/sky/serve/README.md index 653bc88c75b..41578e854ff 100644 --- a/sky/serve/README.md +++ b/sky/serve/README.md @@ -9,7 +9,7 @@ Sky Serve transparently handles load balancing, failover and autoscaling of the ## Architecture Sky Serve has four key components: -1. Server - The HTTP server is responsible for recieving requests and redirecting them to healthy endpoints. +1. Redirector - The HTTP server is responsible for recieving requests and redirecting them to healthy endpoints. 2. Load balancers - spread requests across healthy endpoints according to different policies. 3. Autoscalers - scale up and down the number of serving endpoints according to different policies and handle recovery of unhealthy endpoints. 4. Infra Providers - provides a uniform interface to talk to SkyPilot. 
From c56ae2232106cadf9e0ba9749cf92caf379e6780 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 17 Jul 2023 21:46:53 -0700 Subject: [PATCH 006/223] [SkyServe] add http server example (#2260) add http server example --- sky/serve/examples/http_server/README.md | 11 +++++++++ sky/serve/examples/http_server/server.py | 30 ++++++++++++++++++++++++ sky/serve/examples/http_server/task.yaml | 12 ++++++++++ 3 files changed, 53 insertions(+) create mode 100644 sky/serve/examples/http_server/README.md create mode 100644 sky/serve/examples/http_server/server.py create mode 100644 sky/serve/examples/http_server/task.yaml diff --git a/sky/serve/examples/http_server/README.md b/sky/serve/examples/http_server/README.md new file mode 100644 index 00000000000..2bc059cee94 --- /dev/null +++ b/sky/serve/examples/http_server/README.md @@ -0,0 +1,11 @@ +# HTTP Server example for SkyServe + +## Usage + +```bash +# Run controller. +python -m sky.serve.controller --task-yaml sky/serve/examples/http_server/task.yaml + +# Run redirector. +python -m sky.serve.redirector --task-yaml sky/serve/examples/http_server/task.yaml +``` diff --git a/sky/serve/examples/http_server/server.py b/sky/serve/examples/http_server/server.py new file mode 100644 index 00000000000..4ea616b148e --- /dev/null +++ b/sky/serve/examples/http_server/server.py @@ -0,0 +1,30 @@ +import http.server +import socketserver + +PORT = 8081 + +class MyHttpRequestHandler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): + # Return 200 for all paths + # Therefore, readiness_probe will return 200 at path '/health' + self.send_response(200) + self.send_header('Content-type', 'text/html') + self.end_headers() + html = ''' + + + SkyPilot Test Page + + +

Hi, SkyPilot here!

+ + + ''' + self.wfile.write(bytes(html, 'utf8')) + return + +Handler = MyHttpRequestHandler + +with socketserver.TCPServer(("", PORT), Handler) as httpd: + print("serving at port", PORT) + httpd.serve_forever() diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml new file mode 100644 index 00000000000..b82fbb29f75 --- /dev/null +++ b/sky/serve/examples/http_server/task.yaml @@ -0,0 +1,12 @@ +resources: + cloud: gcp + ports: + - 8081 + +workdir: . + +run: python3 server.py + +service: + port: 8081 + readiness_probe: /health From f28ebbbed63d2eb43e3778fe8682df7466a97cb4 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 25 Jul 2023 18:12:24 -0700 Subject: [PATCH 007/223] [SkyServe] `sky serve` CLI prototype (#2276) * Add service schema * use new serve YAML * change to qpm * change to fix node * refactor init of SkyServiceSpec * change http example to new yaml format * update default value of from_yaml_config and handle service in task * Launching successfully * use argument in controller & redirector * resolve comments * use qps instead * raise when multiple task found * change to qps * introduce constants * introduce constants & fix bugs * add sky down * add Services No existing services. 
without STATUS (but with #healthy replica * format * add llama2 example * add fields to service db * status with replica information * fix policy parsing bug * add auth todo * add replica status todo * change cluster name prefix and order of the column * minor fixes * reorder status * change name: controller --> control plane * change name: middleware --> controller * clean code * rename default service name * env vars * add purge and skip identity check on serve controller * upload filemounts and workdir to storage & enhance --purge --- sky/__init__.py | 4 +- sky/backends/backend_utils.py | 58 +++++ sky/backends/cloud_vm_ray_backend.py | 13 +- sky/cli.py | 187 ++++++++++++++++ sky/core.py | 5 + sky/execution.py | 205 +++++++++++++++++- sky/global_user_state.py | 140 ++++++++++++ sky/serve/__init__.py | 3 + sky/serve/autoscalers.py | 48 ++-- sky/serve/common.py | 119 ++++++++-- sky/serve/constants.py | 10 + sky/serve/{controller.py => control_plane.py} | 83 ++++--- sky/serve/examples/http_server/README.md | 11 - sky/serve/examples/http_server/server.py | 3 + sky/serve/examples/http_server/task.yaml | 7 +- sky/serve/examples/llama2/chat.py | 42 ++++ sky/serve/examples/llama2/llama2.yaml | 50 +++++ sky/serve/infra_providers.py | 47 ++-- sky/serve/load_balancers.py | 10 +- sky/serve/redirector.py | 44 ++-- sky/setup_files/setup.py | 5 +- sky/status_lib.py | 34 +++ sky/task.py | 30 ++- sky/templates/skyserve-controller.yaml.j2 | 27 +++ sky/utils/cli_utils/status_utils.py | 86 ++++++++ sky/utils/schemas.py | 53 ++++- 26 files changed, 1196 insertions(+), 128 deletions(-) create mode 100644 sky/serve/__init__.py create mode 100644 sky/serve/constants.py rename sky/serve/{controller.py => control_plane.py} (59%) delete mode 100644 sky/serve/examples/http_server/README.md create mode 100644 sky/serve/examples/llama2/chat.py create mode 100644 sky/serve/examples/llama2/llama2.yaml create mode 100644 sky/templates/skyserve-controller.yaml.j2 diff --git a/sky/__init__.py 
b/sky/__init__.py index 715a126e1d1..a3f0631a4a7 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -12,7 +12,7 @@ from sky import clouds from sky.clouds.service_catalog import list_accelerators from sky.dag import Dag -from sky.execution import launch, exec, spot_launch # pylint: disable=redefined-builtin +from sky.execution import launch, exec, spot_launch, serve_up, serve_down # pylint: disable=redefined-builtin from sky.resources import Resources from sky.task import Task from sky.optimizer import Optimizer, OptimizeTarget @@ -64,6 +64,8 @@ 'launch', 'exec', 'spot_launch', + 'serve_up', + 'serve_down', # core APIs 'status', 'start', diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 03eb666836b..f6f45463bff 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,4 +1,5 @@ """Util constants/functions for the backends.""" +import base64 from datetime import datetime import difflib import enum @@ -6,6 +7,7 @@ import json import os import pathlib +import pickle import re import subprocess import tempfile @@ -37,6 +39,7 @@ from sky import skypilot_config from sky import sky_logging from sky import spot as spot_lib +from sky import serve as serve_lib from sky import status_lib from sky.backends import onprem_utils from sky.skylet import constants @@ -1319,6 +1322,10 @@ def generate_cluster_name(): return f'sky-{uuid.uuid4().hex[:4]}-{get_cleaned_username()}' +def generate_service_name(): + return f'service-{uuid.uuid4().hex[:4]}' + + def get_cleaned_username() -> str: """Cleans the current username to be used as part of a cluster name. 
@@ -2394,6 +2401,57 @@ def _refresh_cluster(cluster_name): return kept_records +def refresh_service_status(service: Optional[str]) -> List[Dict[str, Any]]: + if service is None: + service_records = global_user_state.get_services() + else: + service_record = global_user_state.get_service_from_name(service) + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service} does not exist.') + service_records = [service_record] + # TODO(tian): Make it run in parallel. + for record in service_records: + controller_cluster_name = record['controller_cluster_name'] + endpoint = record['endpoint'] + if not endpoint: + continue + # TODO(tian): Refactor: store ip and app_port separately. + controller_ip = endpoint.split(':')[0] + with requests.Session() as session: + try: + resp = session.get( + f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}/control_plane/get_replica_nums', + timeout=5) + except requests.RequestException: + pass + else: + record.update(resp.json()) + if record['num_healthy_replicas'] > 0: + record['status'] = status_lib.ServiceStatus.RUNNING + elif record['num_unhealthy_replicas'] > 0: + record['status'] = status_lib.ServiceStatus.REPLICA_INIT + global_user_state.add_or_update_service(**record) + if service is not None: + assert record['name'] == service + try: + resp = session.get( + f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}/control_plane/get_replica_info', + timeout=5) + except requests.RequestException: + pass + else: + record['replica_info'] = resp.json()['replica_info'] + decoded_info = [] + for info in record['replica_info']: + decoded_info.append({ + k: pickle.loads(base64.b64decode(v)) + for k, v in info.items() + }) + record['replica_info'] = decoded_info + return service_records + + @typing.overload def get_backend_from_handle( handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle' diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 
6f98a19c1e4..3adb6c1309b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -33,6 +33,7 @@ from sky import optimizer from sky import skypilot_config from sky import spot as spot_lib +from sky import serve as serve_lib from sky import status_lib from sky import task as task_lib from sky.data import data_utils @@ -2891,8 +2892,9 @@ def _exec_code_on_head( f'Failed to submit job {job_id}.', stderr=stdout + stderr) - logger.info('Job submitted with Job ID: ' - f'{style.BRIGHT}{job_id}{style.RESET_ALL}') + if not handle.cluster_name.startswith(serve_lib.CONTROLLER_PREFIX): + logger.info('Job submitted with Job ID: ' + f'{style.BRIGHT}{job_id}{style.RESET_ALL}') try: if not detach_run: @@ -2923,7 +2925,9 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - else: + elif not name.startswith(serve_lib.CONTROLLER_PREFIX): + # Skip logging for submit control plane & redirector jobs + # to controller logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -3032,7 +3036,8 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - if name == spot_lib.SPOT_CONTROLLER_NAME or down: + if (name == spot_lib.SPOT_CONTROLLER_NAME or down or + name.startswith(serve_lib.CONTROLLER_PREFIX)): return stop_str = ('\nTo stop the cluster:' f'\t{backend_utils.BOLD}sky stop {name}' diff --git a/sky/cli.py b/sky/cli.py index 292b0dc4155..3cf9ddc87a0 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -438,6 +438,13 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter, return global_user_state.get_cluster_names_start_with(incomplete) +def _complete_service_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: + """Handle shell completion for service names.""" + del ctx, param # Unused. 
+ return global_user_state.get_service_names_start_with(incomplete) + + def _complete_storage_name(ctx: click.Context, param: click.Parameter, incomplete: str) -> List[str]: """Handle shell completion for storage names.""" @@ -3798,6 +3805,186 @@ def spot_dashboard(port: Optional[int]): click.echo('Exiting.') +@cli.group(cls=_NaturalOrderGroup) +def serve(): + """SkyServe commands CLI.""" + pass + + +@serve.command('up', cls=_DocumentedCodeCommand) +@click.argument('entrypoint', + required=True, + type=str, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--service', + '-s', + default=None, + type=str, + help='A service name. Unique for each service. If not provided, ' + 'provision a new service with an autogenerated name.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') +def serve_up( + entrypoint: str, + service: Optional[str], + yes: bool, +): + """Launches a SkyServe instance. + + ENTRYPOINT must points to a valid YAML file. + + Example: + + .. code-block:: bash + + sky serve up service.yaml + """ + if service is None: + # TODO(tian): Check service name is unique. + service = backend_utils.generate_service_name() + + shell_splits = shlex.split(entrypoint) + yaml_file_provided = (len(shell_splits) == 1 and + (shell_splits[0].endswith('yaml') or + shell_splits[0].endswith('.yml'))) + if not yaml_file_provided: + click.secho('ENTRYPOINT must points to a valid YAML file.', fg='red') + return + + is_yaml = True + config: Optional[List[Dict[str, Any]]] = None + try: + with open(entrypoint, 'r') as f: + try: + config = list(yaml.safe_load_all(f)) + if config: + # FIXME(zongheng): in a chain DAG YAML it only returns the + # first section. OK for downstream but is weird. + result = config[0] + else: + result = {} + if isinstance(result, str): + invalid_reason = ( + 'cannot be parsed into a valid YAML file. 
' + 'Please check syntax.') + is_yaml = False + except yaml.YAMLError as e: + if yaml_file_provided: + logger.debug(e) + invalid_reason = ('contains an invalid configuration. ' + ' Please check syntax.') + is_yaml = False + except OSError: + entry_point_path = os.path.expanduser(entrypoint) + if not os.path.exists(entry_point_path): + invalid_reason = ('does not exist. Please check if the path' + ' is correct.') + elif not os.path.isfile(entry_point_path): + invalid_reason = ('is not a file. Please check if the path' + ' is correct.') + else: + invalid_reason = ('yaml.safe_load() failed. Please check if the' + ' path is correct.') + is_yaml = False + if not is_yaml: + click.secho( + f'{entrypoint!r} looks like a yaml path but {invalid_reason}', + fg='red') + return + + click.secho('Service from YAML spec: ', fg='yellow', nl=False) + click.secho(entrypoint, bold=True) + usage_lib.messages.usage.update_user_task_yaml(entrypoint) + dag = dag_utils.load_chain_dag_from_yaml(entrypoint) + if len(dag.tasks) > 1: + click.secho('Multiple tasks found in the YAML file.', fg='red') + return + task = dag.tasks[0] + if task.service is None: + click.secho('Service section not found in the YAML file.', fg='red') + return + + if not yes: + prompt = f'Launching a new service {service}. Proceed?' 
+ if prompt is not None: + click.confirm(prompt, default=True, abort=True, show_default=True) + + sky.serve_up(task, service, entrypoint) + + +@serve.command('status', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') +@click.argument('service', + required=False, + type=str, + **_get_shell_complete_args(_complete_service_name)) +@usage_lib.entrypoint +# pylint: disable=redefined-builtin +def serve_status(all: bool, service: Optional[str]): + service_records = core.service_status(service) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' + f'{colorama.Style.RESET_ALL}') + status_utils.show_service_table(service_records, all) + if service is not None: + # If service not exist, we should already raise an error in + # core.service_status. + assert len(service_records) == 1, service_records + service_record = service_records[0] + if 'replica_info' not in service_record: + click.secho(f'Failed to refresh status of service: {service}.', + fg='red') + return + click.echo( + f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Replicas of {service}' + f'{colorama.Style.RESET_ALL}') + status_utils.show_replica_table(service_record['replica_info'], all) + + +@serve.command('down', cls=_DocumentedCodeCommand) +@click.argument('service', + required=True, + **_get_shell_complete_args(_complete_service_name)) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') +@click.option('--purge', + '-p', + is_flag=True, + default=False, + required=False, + help='Ignore errors (if any). ') +def serve_down( + service: str, + yes: bool, + purge: bool, +): + """Stops a SkyServe instance. + + Example: + + .. code-block:: bash + + sky serve down my-service + """ + if not yes: + prompt = f'Tearing down service {service}. Proceed?' 
+ click.confirm(prompt, default=True, abort=True, show_default=True) + + sky.serve_down(service, purge) + + # ============================== # Sky Benchmark CLIs # ============================== diff --git a/sky/core.py b/sky/core.py index 64a3161a943..71bca86b29c 100644 --- a/sky/core.py +++ b/sky/core.py @@ -109,6 +109,11 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, cluster_names=cluster_names) +@usage_lib.entrypoint +def service_status(service: Optional[str]) -> List[Dict[str, Any]]: + return backend_utils.refresh_service_status(service) + + @usage_lib.entrypoint def cost_report() -> List[Dict[str, Any]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. diff --git a/sky/execution.py b/sky/execution.py index 810a4980a20..e89d390852f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -25,12 +25,15 @@ import sky from sky import backends from sky import clouds +from sky import core from sky import exceptions from sky import global_user_state from sky import optimizer from sky import skypilot_config from sky import sky_logging from sky import spot +from sky import serve +from sky import status_lib from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp @@ -365,7 +368,9 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - if cluster_name != spot.SPOT_CONTROLLER_NAME: + if (cluster_name != spot.SPOT_CONTROLLER_NAME and + cluster_name is not None and + not cluster_name.startswith(serve.CONTROLLER_PREFIX)): # UX: print live clusters to make users aware (to save costs). 
# # Don't print if this job is launched by the spot controller, @@ -941,3 +946,201 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): raise exceptions.NotSupportedError( f'Unsupported store type: {store_type}') storage_obj.force_delete = True + + +@usage_lib.entrypoint +def serve_up( + task: 'sky.Task', + name: str, + original_yaml_path: str, +): + """Serve up a service. + + Please refer to the sky.cli.serve_up for the document. + + Args: + task: sky.Task to serve up. + name: Name of the RESTful API. + + Raises: + """ + controller_cluster_name = serve.CONTROLLER_PREFIX + name + assert task.service is not None, task + policy = task.service.policy_str() + assert len(task.resources) == 1 + requested_resources = list(task.resources)[0] + global_user_state.add_or_update_service( + name, controller_cluster_name, '', + status_lib.ServiceStatus.CONTROLLER_INIT, 0, 0, 0, policy, + requested_resources) + app_port = int(task.service.app_port) + assert len(task.resources) == 1, task + task.set_resources(list(task.resources)[0].copy(ports=[app_port])) + + # TODO(tian): Use skyserve constants. 
+ _maybe_translate_local_file_mounts_and_sync_up(task) + + with tempfile.NamedTemporaryFile(prefix=f'serve-task-{name}-', + mode='w') as f: + task_config = task.to_yaml_config() + if 'resources' in task_config and 'spot_recovery' in task_config[ + 'resources']: + del task_config['resources']['spot_recovery'] + common_utils.dump_yaml(f.name, task_config) + remote_task_yaml_path = f'{serve.SERVICE_YAML_PREFIX}/service_{name}.yaml' + vars_to_fill = { + 'ports': [app_port, serve.CONTROL_PLANE_PORT], + 'remote_task_yaml_path': remote_task_yaml_path, + 'local_task_yaml_path': f.name, + 'is_dev': env_options.Options.IS_DEVELOPER.get(), + 'is_debug': env_options.Options.SHOW_DEBUG_INFO.get(), + 'disable_logging': env_options.Options.DISABLE_LOGGING.get(), + } + controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, + f'{name}.yaml') + backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, + vars_to_fill, + output_path=controller_yaml_path) + controller_task = task_lib.Task.from_yaml(controller_yaml_path) + assert len(controller_task.resources) == 1, controller_task + print(f'{colorama.Fore.YELLOW}' + f'Launching controller for {name}...' + f'{colorama.Style.RESET_ALL}') + + _execute( + entrypoint=controller_task, + stream_logs=True, + cluster_name=controller_cluster_name, + retry_until_up=True, + ) + + handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name) + assert isinstance(handle, backends.CloudVmRayResourceHandle) + endpoint = f'{handle.head_ip}:{task.service.app_port}' + global_user_state.add_or_update_service( + name, controller_cluster_name, endpoint, + status_lib.ServiceStatus.REPLICA_INIT, 0, 0, 0, policy, + requested_resources) + + print( + f'{colorama.Fore.YELLOW}' + 'Launching control plane process on controller...' 
+ f'{colorama.Style.RESET_ALL}', + end='') + _execute( + entrypoint=sky.Task( + name='run-control-plane', + run='python -m sky.serve.control_plane --service-name ' + f'{name} --task-yaml {remote_task_yaml_path} ' + f'--port {serve.CONTROL_PLANE_PORT}'), + stream_logs=False, + handle=handle, + stages=[Stage.EXEC], + cluster_name=controller_cluster_name, + detach_run=True, + ) + + print( + f'{colorama.Fore.YELLOW}' + 'Launching redirector process on controller...' + f'{colorama.Style.RESET_ALL}', + end='') + _execute( + entrypoint=sky.Task( + name='run-redirector', + run='python -m sky.serve.redirector --task-yaml ' + f'{remote_task_yaml_path} --port {app_port} ' + f'--control-plane-addr http://0.0.0.0:{serve.CONTROL_PLANE_PORT}' + ), + stream_logs=False, + handle=handle, + stages=[Stage.EXEC], + cluster_name=controller_cluster_name, + detach_run=True, + ) + + print(f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}Serving at ' + f'{colorama.Style.RESET_ALL}{colorama.Fore.CYAN}' + f'{endpoint}.\n' + f'{colorama.Style.RESET_ALL}') + + +def serve_down( + name: str, + purge: bool, +): + """Teardown a service. + + Please refer to the sky.cli.serve_down for the document. + + Args: + name: Name of the service. + + Raises: + """ + service_record = global_user_state.get_service_from_name(name) + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {name} does not exist.') + controller_cluster_name = service_record['controller_cluster_name'] + num_healthy_replicas = service_record['num_healthy_replicas'] + num_unhealthy_replicas = service_record['num_unhealthy_replicas'] + num_replicas = num_healthy_replicas + num_unhealthy_replicas + handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name) + global_user_state.set_service_status(name, + status_lib.ServiceStatus.SHUTTING_DOWN) + + try: + print( + f'{colorama.Fore.YELLOW}' + f'Stopping control plane and redirector processes on controller...' 
+ f'{colorama.Style.RESET_ALL}') + core.cancel(controller_cluster_name, all=True) + except (ValueError, sky.exceptions.ClusterNotUpError) as e: + if purge: + logger.warning(f'Ignoring error when stopping controller: {e}') + else: + raise e + + try: + if handle is not None: + plural = '' + # TODO(tian): Change to #num replica (including failed one) + if num_replicas > 1: + plural = 's' + print(f'{colorama.Fore.YELLOW}' + f'Tearing down {num_replicas} replica{plural}...' + f'{colorama.Style.RESET_ALL}') + _execute( + entrypoint=sky.Task(name='teardown-all-replicas', + run='sky down -a -y'), + stream_logs=False, + handle=handle, + stages=[Stage.EXEC], + cluster_name=controller_cluster_name, + detach_run=False, + ) + except (RuntimeError, ValueError) as e: + if purge: + logger.warning(f'Ignoring error when cleaning controller: {e}') + else: + raise e + + try: + print(f'{colorama.Fore.YELLOW}' + 'Teardown controller...' + f'{colorama.Style.RESET_ALL}') + core.down(controller_cluster_name, purge=purge) + except (RuntimeError, ValueError) as e: + if purge: + logger.warning(f'Ignoring error when cleaning controller: {e}') + else: + raise e + + global_user_state.remove_service(name) + + print(f'{colorama.Fore.GREEN}' + f'Tear down service {name} done.' 
+ f'{colorama.Style.RESET_ALL}') diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 37d7c9ba903..28c7717b812 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -26,6 +26,7 @@ if typing.TYPE_CHECKING: from sky import backends from sky.data import Storage + from sky import resources as resources_lib _ENABLED_CLOUDS_KEY = 'enabled_clouds' @@ -92,6 +93,18 @@ def create_table(cursor, conn): handle BLOB, last_use TEXT, status TEXT)""") + # Table for Services + cursor.execute("""\ + CREATE TABLE IF NOT EXISTS services ( + name TEXT PRIMARY KEY, + controller_cluster_name TEXT, + endpoint TEXT, + status TEXT, + num_healthy_replicas INTEGER DEFAULT 0, + num_unhealthy_replicas INTEGER DEFAULT 0, + num_failed_replicas INTEGER DEFAULT 0, + policy TEXT, + requested_resources BLOB)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. @@ -272,6 +285,60 @@ def add_or_update_cluster(cluster_name: str, _DB.conn.commit() +def add_or_update_service( + name: str, controller_cluster_name: str, endpoint: str, + status: status_lib.ServiceStatus, num_healthy_replicas: int, + num_unhealthy_replicas: int, num_failed_replicas, policy: str, + requested_resources: Optional['resources_lib.Resources']): + _DB.cursor.execute( + 'INSERT or REPLACE INTO services' + '(name, controller_cluster_name, endpoint, status, ' + 'num_healthy_replicas, num_unhealthy_replicas, ' + 'num_failed_replicas, policy, requested_resources) ' + 'VALUES (' + # name + '?, ' + # controller_cluster_name + '?, ' + # endpoint + '?, ' + # status + '?, ' + # num_healthy_replicas + '?, ' + # num_unhealthy_replicas + '?, ' + # num_failed_replicas + '?, ' + # policy + '?, ' + # requested_resources + '?' 
+ ')', + ( + # name + name, + # controller_cluster_name + controller_cluster_name, + # endpoint + endpoint, + # status + status.value, + # num_healthy_replicas + num_healthy_replicas, + # num_unhealthy_replicas + num_unhealthy_replicas, + # num_failed_replicas + num_failed_replicas, + # policy + policy, + # requested_resources + pickle.dumps(requested_resources), + )) + + _DB.conn.commit() + + def update_last_use(cluster_name: str): """Updates the last used command for the cluster.""" _DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)', @@ -313,6 +380,21 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None: _DB.conn.commit() +def remove_service(service_name: str): + _DB.cursor.execute('DELETE FROM services WHERE name=(?)', (service_name,)) + _DB.conn.commit() + + +def set_service_status(service_name: str, status: status_lib.ServiceStatus): + _DB.cursor.execute('UPDATE services SET status=(?) ' + 'WHERE name=(?)', (status.value, service_name)) + count = _DB.cursor.rowcount + _DB.conn.commit() + assert count <= 1, count + if count == 0: + raise ValueError(f'Service {service_name} not found.') + + def get_handle_from_cluster_name( cluster_name: str) -> Optional['backends.ResourceHandle']: assert cluster_name is not None, 'cluster_name cannot be None' @@ -534,6 +616,33 @@ def get_cluster_from_name( return None +def get_service_from_name( + service_name: Optional[str]) -> Optional[Dict[str, Any]]: + rows = _DB.cursor.execute('SELECT * FROM services WHERE name=(?)', + (service_name,)).fetchall() + for row in rows: + # Explicitly specify the number of fields to unpack, so that + # we can add new fields to the database in the future without + # breaking the previous code. 
+ (name, controller_cluster_name, endpoint, status, num_healthy_replicas, + num_unhealthy_replicas, num_failed_replicas, policy, + requested_resources) = row[:9] + # TODO: use namedtuple instead of dict + record = { + 'name': name, + 'controller_cluster_name': controller_cluster_name, + 'endpoint': endpoint, + 'status': status_lib.ServiceStatus[status], + 'num_healthy_replicas': num_healthy_replicas, + 'num_unhealthy_replicas': num_unhealthy_replicas, + 'num_failed_replicas': num_failed_replicas, + 'policy': policy, + 'requested_resources': pickle.loads(requested_resources), + } + return record + return None + + def get_clusters() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'select * from clusters order by launched_at desc').fetchall() @@ -560,6 +669,31 @@ def get_clusters() -> List[Dict[str, Any]]: return records +def get_services() -> List[Dict[str, Any]]: + rows = _DB.cursor.execute('select * from services').fetchall() + records = [] + for row in rows: + (name, controller_cluster_name, endpoint, status, num_healthy_replicas, + num_unhealthy_replicas, num_failed_replicas, policy, + requested_resources) = row[:9] + # TODO: use namedtuple instead of dict + + record = { + 'name': name, + 'controller_cluster_name': controller_cluster_name, + 'endpoint': endpoint, + 'status': status_lib.ServiceStatus[status], + 'num_healthy_replicas': num_healthy_replicas, + 'num_unhealthy_replicas': num_unhealthy_replicas, + 'num_failed_replicas': num_failed_replicas, + 'policy': policy, + 'requested_resources': pickle.loads(requested_resources), + } + + records.append(record) + return records + + def get_clusters_from_history() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'SELECT ch.cluster_hash, ch.name, ch.num_nodes, ' @@ -611,6 +745,12 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]: return [row[0] for row in rows] +def get_service_names_start_with(starts_with: str) -> List[str]: + rows = _DB.cursor.execute('SELECT name FROM services WHERE name 
LIKE (?)', + (f'{starts_with}%',)) + return [row[0] for row in rows] + + def get_enabled_clouds() -> List[clouds.Cloud]: rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?', (_ENABLED_CLOUDS_KEY,)) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py new file mode 100644 index 00000000000..d17081db6d2 --- /dev/null +++ b/sky/serve/__init__.py @@ -0,0 +1,3 @@ +from sky.serve.constants import (CONTROLLER_PREFIX, CONTROLLER_TEMPLATE, + CONTROLLER_YAML_PREFIX, SERVICE_YAML_PREFIX, + CONTROL_PLANE_PORT) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index a9075beedfb..ecf7116888d 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -1,6 +1,8 @@ import logging import time +from typing import Optional + from sky.serve.infra_providers import InfraProvider from sky.serve.load_balancers import LoadBalancer @@ -43,7 +45,7 @@ def __init__(self, lower_threshold: int = 1, min_nodes: int = 1, **kwargs): - ''' + """ Autoscaler that scales up when the average latency of all servers is above the upper threshold and scales down when the average latency of all servers is below the lower threshold. 
:param args: @@ -51,7 +53,7 @@ def __init__(self, :param lower_threshold: lower threshold for latency in seconds :param min_nodes: minimum number of nodes to keep running :param kwargs: - ''' + """ super().__init__(*args, **kwargs) self.upper_threshold = upper_threshold self.lower_threshold = lower_threshold @@ -78,10 +80,10 @@ class RequestRateAutoscaler(Autoscaler): def __init__(self, *args, - query_interval: int = 10, - upper_threshold: int = 10, - lower_threshold: int = 2, min_nodes: int = 1, + max_nodes: Optional[int] = None, + upper_threshold: Optional[float] = None, + lower_threshold: Optional[float] = None, cooldown: int = 60, **kwargs): """ @@ -95,10 +97,11 @@ def __init__(self, :param kwargs: """ super().__init__(*args, **kwargs) - self.query_interval = query_interval + self.min_nodes = min_nodes + self.max_nodes = max_nodes or min_nodes + self.query_interval = 60 # Therefore thresholds represent queries per minute. self.upper_threshold = upper_threshold self.lower_threshold = lower_threshold - self.min_nodes = min_nodes self.cooldown = cooldown self.last_scale_operation = 0 # Time of last scale operation. @@ -107,11 +110,11 @@ def evaluate_scaling(self): # Check if cooldown period has passed since the last scaling operation if current_time - self.last_scale_operation < self.cooldown: + logger.info(f'Current time: {current_time}, ' + f'last scale operation: {self.last_scale_operation}, ' + f'cooldown: {self.cooldown}') logger.info( - f'Current time: {current_time}, last scale operation: {self.last_scale_operation}, cooldown: {self.cooldown}' - ) - logger.info( - f'Cooldown period has not passed since last scaling operation. Skipping scaling.' + 'Cooldown period has not passed since last scaling operation. Skipping scaling.' 
) return @@ -121,27 +124,30 @@ def evaluate_scaling(self): self.load_balancer.request_timestamps.popleft() num_requests = len(self.load_balancer.request_timestamps) + num_requests = float( + num_requests) / 60 # Convert to requests per second. num_nodes = self.infra_provider.total_servers() requests_per_node = num_requests / num_nodes if num_nodes else num_requests # To account for zero case. logger.info(f'Requests per node: {requests_per_node}') - logger.info( - f'Upper threshold: {self.upper_threshold} q/node, lower threshold: {self.lower_threshold} q/node, queries per node: {requests_per_node} q/node' - ) + logger.info(f'Upper threshold: {self.upper_threshold} qps/node, ' + f'lower threshold: {self.lower_threshold} qps/node, ' + f'queries per node: {requests_per_node} qps/node') scaled = True # Bootstrap case logger.info(f'Number of nodes: {num_nodes}') - if num_nodes == 0 and requests_per_node > 0: - logger.info(f'Bootstrapping autoscaler.') + if num_nodes < self.min_nodes: + logger.info('Bootstrapping autoscaler.') self.scale_up(1) self.last_scale_operation = current_time - elif requests_per_node > self.upper_threshold: - self.scale_up(1) - self.last_scale_operation = current_time - elif requests_per_node < self.lower_threshold: + elif self.upper_threshold is not None and requests_per_node > self.upper_threshold: + if self.infra_provider.total_servers() < self.max_nodes: + self.scale_up(1) + self.last_scale_operation = current_time + elif self.lower_threshold is not None and requests_per_node < self.lower_threshold: if self.infra_provider.total_servers() > self.min_nodes: self.scale_down(1) self.last_scale_operation = current_time else: - logger.info(f'No scaling needed.') + logger.info('No scaling needed.') diff --git a/sky/serve/common.py b/sky/serve/common.py index 5f211dc5a10..f5e66fdf4e1 100644 --- a/sky/serve/common.py +++ b/sky/serve/common.py @@ -1,32 +1,115 @@ -import yaml +from typing import Optional, Dict, Any + +from sky.backends import 
backend_utils +from sky.utils import schemas +from sky.utils import ux_utils class SkyServiceSpec: - def __init__(self, yaml_path: str): - with open(yaml_path, 'r') as f: - self.task = yaml.safe_load(f) - if 'service' not in self.task: - raise ValueError('Task YAML must have a "service" section') - if 'port' not in self.task['service']: - raise ValueError('Task YAML must have a "port" section') - if 'readiness_probe' not in self.task['service']: - raise ValueError('Task YAML must have a "readiness_probe" section') - self._readiness_path = self.get_readiness_path() - self._app_port = self.get_app_port() - - def get_readiness_path(self): + def __init__( + self, + readiness_path: str, + readiness_timeout: int, + app_port: int, + min_replica: int, + max_replica: Optional[int] = None, + qps_upper_threshold: Optional[float] = None, + qps_lower_threshold: Optional[float] = None, + ): + if max_replica is not None and max_replica < min_replica: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'max_replica must be greater than or equal to min_replica') # TODO: check if the path is valid - return f':{self.task["service"]["port"]}{self.task["service"]["readiness_probe"]}' - - def get_app_port(self): + self._readiness_path = f':{app_port}{readiness_path}' + self._readiness_timeout = readiness_timeout # TODO: check if the port is valid - return f'{self.task["service"]["port"]}' + self._app_port = str(app_port) + self._min_replica = min_replica + self._max_replica = max_replica + self._qps_upper_threshold = qps_upper_threshold + self._qps_lower_threshold = qps_lower_threshold + + @classmethod + def from_yaml_config(cls, config: Optional[Dict[str, Any]]): + if config is None: + return None + + backend_utils.validate_schema(config, schemas.get_service_schema(), + 'Invalid service YAML:') + + service_config = {} + service_config['readiness_path'] = config['readiness_probe']['path'] + service_config['readiness_timeout'] = config['readiness_probe'][ + 
'readiness_timeout'] + service_config['app_port'] = config['port'] + service_config['min_replica'] = config['replica_policy']['min_replica'] + service_config['max_replica'] = config['replica_policy'].get( + 'max_replica', None) + service_config['qps_upper_threshold'] = config['replica_policy'].get( + 'qps_upper_threshold', None) + service_config['qps_lower_threshold'] = config['replica_policy'].get( + 'qps_lower_threshold', None) + + return SkyServiceSpec(**service_config) + + def to_yaml_config(self): + replica_policy = {} + + def add_if_not_none(key, value, no_empty: bool = False): + if no_empty and not value: + return + if value is not None: + replica_policy[key] = value + + add_if_not_none('min_replica', self.min_replica) + add_if_not_none('max_replica', self.max_replica) + add_if_not_none('qps_upper_threshold', self.qps_upper_threshold) + add_if_not_none('qps_lower_threshold', self.qps_lower_threshold) + + return { + 'port': int(self.app_port), + 'readiness_probe': { + 'path': self.readiness_path[len(f':{self.app_port}'):], + 'readiness_timeout': self.readiness_timeout, + }, + 'replica_policy': replica_policy, + } + + def policy_str(self): + if self.max_replica == self.min_replica or self.max_replica is None: + plural = '' + if self.min_replica > 1: + plural = 'S' + return f'FIXED NODE{plural}: {self.min_replica}' + # TODO(tian): Refactor to contain more information + return f'AUTOSCALE [{self.min_replica}, {self.max_replica}]' @property def readiness_path(self): return self._readiness_path + @property + def readiness_timeout(self): + return self._readiness_timeout + @property def app_port(self): return self._app_port + + @property + def min_replica(self): + return self._min_replica + + @property + def max_replica(self): + return self._max_replica + + @property + def qps_upper_threshold(self): + return self._qps_upper_threshold + + @property + def qps_lower_threshold(self): + return self._qps_lower_threshold diff --git a/sky/serve/constants.py 
b/sky/serve/constants.py new file mode 100644 index 00000000000..e8ac94662c3 --- /dev/null +++ b/sky/serve/constants.py @@ -0,0 +1,10 @@ +"""Constants used for SkyServe.""" + +CONTROLLER_PREFIX = 'controller-' + +CONTROLLER_TEMPLATE = 'skyserve-controller.yaml.j2' +CONTROLLER_YAML_PREFIX = '~/.sky/serve' + +SERVICE_YAML_PREFIX = '~/.sky/service' + +CONTROL_PLANE_PORT = 31001 diff --git a/sky/serve/controller.py b/sky/serve/control_plane.py similarity index 59% rename from sky/serve/controller.py rename to sky/serve/control_plane.py index dfe5a953764..8b710646a96 100644 --- a/sky/serve/controller.py +++ b/sky/serve/control_plane.py @@ -9,6 +9,7 @@ import time import threading +import yaml from typing import Optional @@ -23,13 +24,13 @@ logger = logging.getLogger(__name__) -class Controller: +class ControlPlane: def __init__(self, + port: int, infra_provider: InfraProvider, load_balancer: LoadBalancer, - autoscaler: Optional[Autoscaler] = None, - port: int = 8082): + autoscaler: Optional[Autoscaler] = None): self.port = port self.infra_provider = infra_provider self.load_balancer = load_balancer @@ -43,9 +44,10 @@ def server_fetcher(self): self.load_balancer.probe_endpoints(server_ips) time.sleep(10) + # TODO(tian): Authentication!!! 
def run(self): - @self.app.post('/controller/increment_request_count') + @self.app.post('/control_plane/increment_request_count') async def increment_request_count(request: Request): # await request request_data = await request.json() @@ -55,10 +57,26 @@ async def increment_request_count(request: Request): self.load_balancer.increment_request_count(count=count) return {'message': 'Success'} - @self.app.get('/controller/get_server_ips') + @self.app.get('/control_plane/get_server_ips') def get_server_ips(): return {'server_ips': list(self.load_balancer.servers_queue)} + @self.app.get('/control_plane/get_replica_info') + def get_replica_info(): + return {'replica_info': self.infra_provider.get_replica_info()} + + @self.app.get('/control_plane/get_replica_nums') + def get_replica_nums(): + return { + 'num_healthy_replicas': len(self.load_balancer.available_servers + ), + 'num_unhealthy_replicas': + self.infra_provider.total_servers() - + len(self.load_balancer.available_servers), + # TODO(tian): Detect error replicas + 'num_failed_replicas': 0 + } + # Run server_monitor and autoscaler.monitor (if autoscaler is defined) in separate threads in the background. This should not block the main thread. 
server_fetcher_thread = threading.Thread(target=self.server_fetcher, daemon=True) @@ -73,7 +91,11 @@ def get_server_ips(): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='SkyServe Server') + parser = argparse.ArgumentParser(description='SkyServe Control Plane') + parser.add_argument('--service-name', + type=str, + help='Name of the service', + required=True) parser.add_argument('--task-yaml', type=str, help='Task YAML file', @@ -81,24 +103,26 @@ def get_server_ips(): parser.add_argument('--port', '-p', type=int, - help='Port to run the controller', - default=8082) - parser.add_argument('--min-nodes', - type=int, - default=1, - help='Minimum nodes to keep running') + help='Port to run the control plane', + required=True) args = parser.parse_args() # ======= Infra Provider ========= # infra_provider = DummyInfraProvider() - infra_provider = SkyPilotInfraProvider(args.task_yaml) + infra_provider = SkyPilotInfraProvider(args.task_yaml, args.service_name) # ======= Load Balancer ========= - service_spec = SkyServiceSpec(args.task_yaml) + with open(args.task_yaml, 'r') as f: + task = yaml.safe_load(f) + if 'service' not in task: + raise ValueError('Task YAML must have a "service" section') + service_config = task['service'] + service_spec = SkyServiceSpec.from_yaml_config(service_config) # Select the load balancing policy: RoundRobinLoadBalancer or LeastLoadedLoadBalancer load_balancer = RoundRobinLoadBalancer( infra_provider=infra_provider, - endpoint_path=service_spec.readiness_path) + endpoint_path=service_spec.readiness_path, + readiness_timeout=service_spec.readiness_timeout) # load_balancer = LeastLoadedLoadBalancer(n=5) # autoscaler = LatencyThresholdAutoscaler(load_balancer, # upper_threshold=0.5, # 500ms @@ -106,17 +130,18 @@ def get_server_ips(): # ======= Autoscaler ========= # Create an autoscaler with the RequestRateAutoscaler policy. Thresholds are defined as requests per node in the defined interval. 
- autoscaler = RequestRateAutoscaler(infra_provider, - load_balancer, - frequency=5, - query_interval=60, - lower_threshold=0, - upper_threshold=1, - min_nodes=args.min_nodes, - cooldown=60) - - # ======= Controller ========= - # Create a controller object and run it. - controller = Controller(infra_provider, load_balancer, autoscaler, - args.port) - controller.run() + autoscaler = RequestRateAutoscaler( + infra_provider, + load_balancer, + frequency=5, + min_nodes=service_spec.min_replica, + max_nodes=service_spec.max_replica, + upper_threshold=service_spec.qps_upper_threshold, + lower_threshold=service_spec.qps_lower_threshold, + cooldown=60) + + # ======= ControlPlane ========= + # Create a control plane object and run it. + control_plane = ControlPlane(args.port, infra_provider, load_balancer, + autoscaler) + control_plane.run() diff --git a/sky/serve/examples/http_server/README.md b/sky/serve/examples/http_server/README.md deleted file mode 100644 index 2bc059cee94..00000000000 --- a/sky/serve/examples/http_server/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# HTTP Server example for SkyServe - -## Usage - -```bash -# Run controller. -python -m sky.serve.controller --task-yaml sky/serve/examples/http_server/task.yaml - -# Run redirector. 
-python -m sky.serve.redirector --task-yaml sky/serve/examples/http_server/task.yaml -``` diff --git a/sky/serve/examples/http_server/server.py b/sky/serve/examples/http_server/server.py index 4ea616b148e..303b117d26d 100644 --- a/sky/serve/examples/http_server/server.py +++ b/sky/serve/examples/http_server/server.py @@ -3,7 +3,9 @@ PORT = 8081 + class MyHttpRequestHandler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): # Return 200 for all paths # Therefore, readiness_probe will return 200 at path '/health' @@ -23,6 +25,7 @@ def do_GET(self): self.wfile.write(bytes(html, 'utf8')) return + Handler = MyHttpRequestHandler with socketserver.TCPServer(("", PORT), Handler) as httpd: diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml index b82fbb29f75..d0fe866f259 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/sky/serve/examples/http_server/task.yaml @@ -9,4 +9,9 @@ run: python3 server.py service: port: 8081 - readiness_probe: /health + readiness_probe: + path: /health + readiness_timeout: 12000 + replica_policy: + min_replica: 1 + max_replica: 1 diff --git a/sky/serve/examples/llama2/chat.py b/sky/serve/examples/llama2/chat.py new file mode 100644 index 00000000000..2f450479851 --- /dev/null +++ b/sky/serve/examples/llama2/chat.py @@ -0,0 +1,42 @@ +import requests +import json +import openai + +stream = True +model = "Llama-2-7b-chat-hf" +init_prompt = "You are a helful assistant." 
+history = [{"role": "system", "content": init_prompt}] +endpoint = input("Endpoint: ") +url = f"http://{endpoint}/v1/chat/completions" +openai.api_base = f"http://{endpoint}/v1" +openai.api_key = "placeholder" + +try: + while True: + user_input = input("[User] ") + history.append({"role": "user", "content": user_input}) + if stream: + resp = openai.ChatCompletion.create(model=model, + messages=history, + stream=True) + print("[Chatbot]", end="", flush=True) + tot = "" + for i in resp: + dlt = i["choices"][0]["delta"] + if "content" not in dlt: + continue + print(dlt["content"], end="", flush=True) + tot += dlt["content"] + print() + history.append({"role": "assistant", "content": tot}) + else: + resp = requests.post(url, + data=json.dumps({ + "model": model, + "messages": history + })) + msg = resp.json()["choices"][0]["message"] + print("[Chatbot]" + msg["content"]) + history.append(msg) +except KeyboardInterrupt: + print("\nBye!") diff --git a/sky/serve/examples/llama2/llama2.yaml b/sky/serve/examples/llama2/llama2.yaml new file mode 100644 index 00000000000..a1317e33509 --- /dev/null +++ b/sky/serve/examples/llama2/llama2.yaml @@ -0,0 +1,50 @@ +resources: + cloud: gcp + memory: 32+ + accelerators: T4:1 + disk_size: 1024 + disk_tier: high + +service: + port: 8087 + readiness_probe: + path: /v1/models + readiness_timeout: 1200 + replica_policy: + min_replica: 2 + +envs: + MODEL_SIZE: 7 + HF_TOKEN: # TODO: Replace with huggingface token + +setup: | + conda activate chatbot + if [ $? -ne 0 ]; then + conda create -n chatbot python=3.9 -y + conda activate chatbot + fi + + # Install dependencies + pip install git+https://github.com/lm-sys/FastChat.git + # Need the latest transformers to support 70B model + pip install git+https://github.com/huggingface/transformers.git + + python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + +run: | + conda activate chatbot + + echo 'Starting controller...' 
+ python -u -m fastchat.serve.controller --host 0.0.0.0 > ~/controller.log 2>&1 & + sleep 10 + echo 'Starting model worker...' + python -u -m fastchat.serve.model_worker --host 0.0.0.0 \ + --model-path meta-llama/Llama-2-${MODEL_SIZE}b-chat-hf \ + --num-gpus $SKYPILOT_NUM_GPUS_PER_NODE 2>&1 \ + | tee model_worker.log & + + echo 'Waiting for model worker to start...' + while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting openai api server server...' + python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 0ebb3b32c58..0a4c931e678 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -1,6 +1,8 @@ import logging from typing import List import time +import pickle +import base64 import sky from sky.backends import backend_utils @@ -77,21 +79,21 @@ def terminate_servers(self, unhealthy_servers: List[str]): class SkyPilotInfraProvider(InfraProvider): - CLUSTER_NAME_PREFIX = 'skyserve-' - def __init__(self, task_yaml_path: str): + def __init__(self, task_yaml_path: str, cluster_name_prefix: str): self.task_yaml_path = task_yaml_path + self.cluster_name_prefix = cluster_name_prefix + '-' self.id_counter = self._get_id_start() def _get_id_start(self): - ''' + """ Returns the id to start from when creating a new cluster - ''' + """ clusters = sky.global_user_state.get_clusters() # Filter out clusters that don't have the prefix clusters = [ cluster for cluster in clusters - if self.CLUSTER_NAME_PREFIX in cluster['name'] + if self.cluster_name_prefix in cluster['name'] ] # Get the greatest id max_id = 0 @@ -108,7 +110,7 @@ def _get_ip_clusname_map(self): ip_clusname_map = {} for cluster in clusters: name = cluster['name'] - if self.CLUSTER_NAME_PREFIX in name: + if self.cluster_name_prefix in name: handle = cluster['handle'] try: # Get the head node ip @@ -121,6 +123,23 @@ def 
_get_ip_clusname_map(self): continue return ip_clusname_map + def get_replica_info(self): + clusters = sky.global_user_state.get_clusters() + infos = [] + for cluster in clusters: + if self.cluster_name_prefix in cluster['name']: + info = { + 'name': cluster['name'], + 'handle': cluster['handle'], + 'status': cluster['status'], + } + info = { + k: base64.b64encode(pickle.dumps(v)).decode('utf-8') + for k, v in info.items() + } + infos.append(info) + return infos + def _get_server_ips(self): return list(self._get_ip_clusname_map().keys()) @@ -130,7 +149,7 @@ def _return_total_servers(self): # FIXME - this is a hack to get around. should implement a better filtering mechanism clusters = [ cluster for cluster in clusters - if self.CLUSTER_NAME_PREFIX in cluster['name'] + if self.cluster_name_prefix in cluster['name'] ] return len(clusters) @@ -138,10 +157,12 @@ def _scale_up(self, n): # Launch n new clusters task = sky.Task.from_yaml(self.task_yaml_path) for i in range(0, n): - cluster_name = f'{self.CLUSTER_NAME_PREFIX}{self.id_counter}' + cluster_name = f'{self.cluster_name_prefix}{self.id_counter}' logger.info(f'Creating SkyPilot cluster {cluster_name}') - sky.launch(task, cluster_name=cluster_name, - detach_run=True) # TODO - make the launch parallel + sky.launch(task, + cluster_name=cluster_name, + detach_run=True, + retry_until_up=True) # TODO - make the launch parallel self.id_counter += 1 def _scale_down(self, n): @@ -151,7 +172,7 @@ def _scale_down(self, n): # Filter out clusters that don't have the prefix clusters = [ cluster for cluster in clusters - if self.CLUSTER_NAME_PREFIX in cluster['name'] + if self.cluster_name_prefix in cluster['name'] ] num_clusters = len(clusters) if num_clusters > 0: @@ -194,10 +215,8 @@ def terminate_servers(self, unhealthy_servers: List[str]): name = ip_to_name_map[endpoint_url] if endpoint_url in unhealthy_servers: logger.info(f'Deleting SkyPilot cluster {name}') - # Run sky.down in a daemon thread so that it doesn't block 
the main thread threading.Thread(target=sky.down, args=(name,), kwargs={ 'purge': True - }, - daemon=True).start() + }).start() diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancers.py index 4c0c95c6ff7..f1d1b464a76 100644 --- a/sky/serve/load_balancers.py +++ b/sky/serve/load_balancers.py @@ -12,12 +12,17 @@ class LoadBalancer: - def __init__(self, infra_provider, endpoint_path, post_data=None): + def __init__(self, + infra_provider, + endpoint_path, + readiness_timeout, + post_data=None): self.available_servers = [] self.request_count = 0 self.request_timestamps = deque() self.infra_provider = infra_provider self.endpoint_path = endpoint_path + self.readiness_timeout = readiness_timeout self.post_data = post_data def increment_request_count(self, count=1): @@ -37,7 +42,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.servers_queue = deque() self.first_unhealthy_time = {} - self.timeout = 18000 logger.info(f'Endpoint path: {self.endpoint_path}') def probe_endpoints(self, endpoint_ips): @@ -101,7 +105,7 @@ def probe_endpoint(endpoint_ip): if server not in self.first_unhealthy_time: self.first_unhealthy_time[server] = time.time() elif time.time() - self.first_unhealthy_time[ - server] > self.timeout: # cooldown before terminating a dead server to avoid hysterisis + server] > self.readiness_timeout: # cooldown before terminating a dead server to avoid hysterisis servers_to_terminate.append(server) self.infra_provider.terminate_servers(servers_to_terminate) diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py index 5c3df12c62c..95b364f184a 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/redirector.py @@ -1,6 +1,8 @@ import time import logging +import yaml from collections import deque +from typing import List, Deque from sky.serve.common import SkyServiceSpec @@ -24,34 +26,34 @@ class SkyServeRedirector: def __init__(self, - controller_url: str, + control_plane_url: str, service_spec: SkyServiceSpec, port: 
int = 8081): - self.controller_url = controller_url + self.control_plane_url = control_plane_url self.port = port self.app_port = service_spec.app_port - self.server_ips = [] - self.servers_queue = deque() + self.server_ips: List[str] = [] + self.servers_queue: Deque[str] = deque() self.app = FastAPI() self.request_count = 0 - self.controller_sync_timeout = 20 + self.control_plane_sync_timeout = 20 - def sync_with_controller(self): + def sync_with_control_plane(self): while True: server_ips = [] with requests.Session() as session: try: # send request count response = session.post( - self.controller_url + - '/controller/increment_request_count', + self.control_plane_url + + '/control_plane/increment_request_count', json={'counts': self.request_count}, timeout=5) response.raise_for_status() self.request_count = 0 # get server ips - response = session.get(self.controller_url + - '/controller/get_server_ips') + response = session.get(self.control_plane_url + + '/control_plane/get_server_ips') response.raise_for_status() server_ips = response.json()['server_ips'] except requests.RequestException as e: @@ -59,7 +61,7 @@ def sync_with_controller(self): else: logger.info(f'Server IPs: {server_ips}') self.servers_queue = deque(server_ips) - time.sleep(self.controller_sync_timeout) + time.sleep(self.control_plane_sync_timeout) def select_server(self): if not self.servers_queue: @@ -86,7 +88,7 @@ def serve(self): methods=['GET', 'POST', 'PUT', 'DELETE']) server_fetcher_thread = threading.Thread( - target=self.sync_with_controller, daemon=True) + target=self.sync_with_control_plane, daemon=True) server_fetcher_thread.start() logger.info(f'Sky Server started on http://0.0.0.0:{self.port}') @@ -106,15 +108,21 @@ def serve(self): '-p', type=int, help='Port to run the redirector on', - default=8081) - parser.add_argument('--controller-addr', - default='http://localhost:8082', + required=True) + parser.add_argument('--control-plane-addr', type=str, - help='Controller address 
(ip:port).') + help='Control plane address (ip:port).', + required=True) args = parser.parse_args() - service_spec = SkyServiceSpec(args.task_yaml) - redirector = SkyServeRedirector(controller_url=args.controller_addr, + with open(args.task_yaml, 'r') as f: + task = yaml.safe_load(f) + if 'service' not in task: + raise ValueError('Task YAML must have a "service" section') + service_config = task['service'] + service_spec = SkyServiceSpec.from_yaml_config(service_config) + + redirector = SkyServeRedirector(control_plane_url=args.control_plane_addr, service_spec=service_spec, port=args.port) redirector.serve() diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index acfaafacaae..c28fc671655 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -112,7 +112,10 @@ def parse_readme(readme: str) -> str: 'pulp', # Ray job has an issue with pydantic>2.0.0, due to API changes of pydantic. See # https://github.com/ray-project/ray/issues/36990 - 'pydantic<2.0' + 'pydantic<2.0', + # Required by the SkyServe library + 'uvicorn', + 'fastapi' ] # NOTE: Change the templates/spot-controller.yaml.j2 file if any of the diff --git a/sky/status_lib.py b/sky/status_lib.py index ae9a00c84de..ff977b009e2 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -49,3 +49,37 @@ class StorageStatus(enum.Enum): # Finished uploading, in terminal state READY = 'READY' + + +class ServiceStatus(enum.Enum): + """Service status as recorded in table 'services'.""" + + # Middleware is initializing + CONTROLLER_INIT = 'CONTROLLER_INIT' + + # Replica is initializing + REPLICA_INIT = 'REPLICA_INIT' + + # At least one replica is ready + RUNNING = 'RUNNING' + + # Service is being stopped + SHUTTING_DOWN = 'SHUTTING_DOWN' + + # At least one replica is failed + FAILED = 'FAILED' + + def colored_str(self): + color = _SERVICE_STATUS_TO_COLOR[self] + return f'{color}{self.value}{colorama.Style.RESET_ALL}' + + +_SERVICE_STATUS_TO_COLOR = { + ServiceStatus.CONTROLLER_INIT: 
colorama.Fore.BLUE, + ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, + ServiceStatus.RUNNING: colorama.Fore.GREEN, + ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, + ServiceStatus.FAILED: colorama.Fore.RED, +} + +# TODO(tian): Add status for replicas to distinguish 'skypilot UP' and 'health probe succeeded' diff --git a/sky/task.py b/sky/task.py index 2105d0feeda..150113da5f1 100644 --- a/sky/task.py +++ b/sky/task.py @@ -15,6 +15,7 @@ from sky.backends import backend_utils from sky.data import storage as storage_lib from sky.data import data_utils +from sky.serve import common from sky.skylet import constants from sky.utils import schemas from sky.utils import ux_utils @@ -194,6 +195,7 @@ def __init__( self.estimated_outputs_size_gigabytes = None # Default to CPUNode self.resources = {sky.Resources()} + self._service = None self.time_estimator_func: Optional[Callable[['sky.Resources'], int]] = None self.file_mounts: Optional[Dict[str, str]] = None @@ -365,10 +367,12 @@ def from_yaml_config( resources = config.pop('resources', None) resources = sky.Resources.from_yaml_config(resources) - # FIXME: find a better way to exclude unused fields. - config.pop('service', None) - task.set_resources({resources}) + + service = config.pop('service', None) + service = common.SkyServiceSpec.from_yaml_config(service) + task.set_service(service) + assert not config, f'Invalid task args: {config.keys()}' return task @@ -528,6 +532,22 @@ def set_resources( def get_resources(self): return self.resources + @property + def service(self) -> Optional[common.SkyServiceSpec]: + return self._service + + def set_service(self, service: Optional[common.SkyServiceSpec]) -> 'Task': + """Sets the service spec for this task. + + Args: + service: a SkyServiceSpec object. + + Returns: + self: The current task, with service set. 
+ """ + self._service = service + return self + def set_time_estimator(self, func: Callable[['sky.Resources'], int]) -> 'Task': """Sets a func mapping resources to estimated time (secs). @@ -884,6 +904,10 @@ def add_if_not_none(key, value, no_empty: bool = False): assert len(self.resources) == 1 resources = list(self.resources)[0] add_if_not_none('resources', resources.to_yaml_config()) + + if self.service is not None: + add_if_not_none('service', self.service.to_yaml_config()) + add_if_not_none('num_nodes', self.num_nodes) if self.inputs is not None: diff --git a/sky/templates/skyserve-controller.yaml.j2 b/sky/templates/skyserve-controller.yaml.j2 new file mode 100644 index 00000000000..a750c01e899 --- /dev/null +++ b/sky/templates/skyserve-controller.yaml.j2 @@ -0,0 +1,27 @@ +resources: + cloud: gcp + disk_size: 100 + ports: +{%- for port in ports %} + - {{port}} +{%- endfor %} + +# {% if workdir is not none %} +# workdir: {{workdir}} +# {% endif %} + +file_mounts: + {{remote_task_yaml_path}}: {{local_task_yaml_path}} + +envs: + # skip cloud identity check for serve controller to avoid the overhead. + SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK: 1 +{% if is_dev %} + SKYPILOT_DEV: 1 +{% endif %} +{% if is_debug %} + SKYPILOT_DEBUG: 1 +{% endif %} +{% if disable_logging %} + SKYPILOT_DISABLE_USAGE_COLLECTION: 1 +{% endif %} diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 8ec8222c599..331494b672e 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -19,6 +19,9 @@ _ClusterRecord = Dict[str, Any] # A record returned by core.cost_report(); see its docstr for all fields. _ClusterCostReportRecord = Dict[str, Any] +# A record in global_user_state's 'services' table. 
+_ServiceRecord = Dict[str, Any] +_ReplicaRecord = Dict[str, Any] def truncate_long_string(s: str, max_length: int = 35) -> str: @@ -107,6 +110,68 @@ def show_status_table(cluster_records: List[_ClusterRecord], return num_pending_autostop +def show_service_table(service_records: List[_ServiceRecord], show_all: bool): + status_columns = [ + StatusColumn('NAME', _get_name), + StatusColumn('CONTROLLER_CLUSTER_NAME', + _get_controller_cluster_name, + show_by_default=False), + StatusColumn('ENDPOINT', _get_endpoint), + StatusColumn('#HEALTHY_REPLICAS', _get_healthy_replicas), + StatusColumn('#UNHEALTHY_REPLICAS', _get_unhealthy_replicas), + # TODO(tian): After we have a better way to detect failed replicas + # StatusColumn('#FAILED_REPLICAS', _get_failed_replicas), + StatusColumn('STATUS', _get_service_status_colored), + StatusColumn('POLICY', _get_policy, show_by_default=False), + StatusColumn('REQUESTED_RESOURCES', + _get_requested_resources, + show_by_default=False), + ] + + columns = [] + for status_column in status_columns: + if status_column.show_by_default or show_all: + columns.append(status_column.name) + service_table = log_utils.create_table(columns) + for record in service_records: + row = [] + for status_column in status_columns: + if status_column.show_by_default or show_all: + row.append(status_column.calc(record)) + service_table.add_row(row) + if service_records: + click.echo(service_table) + else: + click.echo('No existing services.') + + +def show_replica_table(replica_records: List[_ReplicaRecord], show_all: bool): + status_columns = [ + StatusColumn('NAME', _get_name), + StatusColumn('RESOURCES', + _get_resources, + trunc_length=70 if not show_all else 0), + StatusColumn('REGION', _get_region), + StatusColumn('STATUS', _get_status_colored), + ] + + columns = [] + for status_column in status_columns: + if status_column.show_by_default or show_all: + columns.append(status_column.name) + replica_table = log_utils.create_table(columns) + for record in 
replica_records: + row = [] + for status_column in status_columns: + if status_column.show_by_default or show_all: + row.append(status_column.calc(record)) + replica_table.add_row(row) + if replica_records: + click.echo(replica_table) + else: + click.echo('No existing replicas.') + + def get_total_cost_of_displayed_records( cluster_records: List[_ClusterCostReportRecord], display_all: bool): """Compute total cost of records to be displayed in cost report.""" @@ -307,6 +372,27 @@ def show_local_status_table(local_clusters: List[str]): _get_command = (lambda cluster_record: cluster_record['last_use']) _get_duration = (lambda cluster_record: log_utils.readable_time_duration( 0, cluster_record['duration'], absolute=True)) +_get_controller_cluster_name = ( + lambda service_record: service_record['controller_cluster_name']) +_get_endpoint = (lambda service_record: service_record['endpoint']) +_get_healthy_replicas = ( + lambda service_record: service_record['num_healthy_replicas']) +_get_unhealthy_replicas = ( + lambda service_record: service_record['num_unhealthy_replicas']) +_get_failed_replicas = ( + lambda service_record: service_record['num_failed_replicas']) +_get_policy = (lambda service_record: service_record['policy']) +_get_requested_resources = ( + lambda service_record: service_record['requested_resources']) + + +def _get_service_status( + service_record: _ServiceRecord) -> status_lib.ServiceStatus: + return service_record['status'] + + +def _get_service_status_colored(service_record: _ServiceRecord) -> str: + return _get_service_status(service_record).colored_str() def _get_status(cluster_record: _ClusterRecord) -> status_lib.ClusterStatus: diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index caa808647e7..c44fd07f941 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -137,6 +137,52 @@ def get_storage_schema(): } +def get_service_schema(): + return { + '$schema': 'http://json-schema.org/draft-07/schema#', + 'type': 'object', + 
'required': ['port', 'readiness_probe', 'replica_policy'], + 'additionalProperties': False, + 'properties': { + 'port': { + 'type': 'integer', + }, + 'readiness_probe': { + 'type': 'object', + 'required': ['path', 'readiness_timeout'], + 'additionalProperties': False, + 'properties': { + 'path': { + 'type': 'string', + }, + 'readiness_timeout': { + 'type': 'number', + }, + } + }, + 'replica_policy': { + 'type': 'object', + 'required': ['min_replica'], + 'additionalProperties': False, + 'properties': { + 'min_replica': { + 'type': 'integer', + }, + 'max_replica': { + 'type': 'integer', + }, + 'qps_upper_threshold': { + 'type': 'number', + }, + 'qps_lower_threshold': { + 'type': 'number', + }, + } + } + } + } + + def get_task_schema(): return { '$schema': 'https://json-schema.org/draft/2020-12/schema', @@ -164,6 +210,10 @@ def get_task_schema(): 'file_mounts': { 'type': 'object', }, + # service config is validated separately using SERVICE_SCHEMA + 'service': { + 'type': 'object', + }, 'setup': { 'type': 'string', }, @@ -197,9 +247,6 @@ def get_task_schema(): 'additionalProperties': { 'type': 'number' } - }, - 'service': { - 'type': 'object', } } } From 8fa432332ff53afc045c79ecd47f8365ff398fc0 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 4 Aug 2023 11:42:36 -0700 Subject: [PATCH 008/223] [SkyServe] Refactoring, Introducing multiprocess for provisioning and `sky serve logs` prototype (#2311) * introducing multiprocessing prototype * add run env to controller & redirector * reefactor and format * add control-plane and redirector logs * minor * minor * Refactor: move to infra provider * Refactor: move load balancer to redirector * refactor, add more logging * add replica status * resolve some TODOs * add post data feature * rename, format * add error message handling * bug fix & logging * fix a bug in continuous unhealthy * add error when user port is same with control plane * fix None post_data bug * add stable diffusion example * remove response body when code == 
200 * add some TODOs and change RUNNING to READY * add failed status * add TODO for return failed replica info * fix sky serve status --help error * add console help messages * remove redundant stable diffusion setup files * rename healthy_replica --> ready_replica * adopt advice from code review * rename to service_name * adopt advice from comment --- sky/backends/backend_utils.py | 69 +-- sky/cli.py | 128 ++++- sky/core.py | 4 +- sky/execution.py | 148 ++++-- sky/global_user_state.py | 20 +- sky/serve/__init__.py | 2 + sky/serve/autoscalers.py | 207 ++++---- sky/serve/common.py | 115 ---- sky/serve/constants.py | 1 + sky/serve/control_plane.py | 147 +++--- sky/serve/examples/http_server/task.yaml | 2 +- .../examples/stable_diffusion_service.yaml | 34 ++ sky/serve/infra_providers.py | 496 ++++++++++++------ sky/serve/load_balancers.py | 187 ++----- sky/serve/redirector.py | 137 +++-- sky/serve/service_spec.py | 160 ++++++ sky/status_lib.py | 34 +- sky/task.py | 11 +- sky/utils/cli_utils/status_utils.py | 9 +- sky/utils/schemas.py | 7 + 20 files changed, 1130 insertions(+), 788 deletions(-) delete mode 100644 sky/serve/common.py create mode 100644 sky/serve/examples/stable_diffusion_service.yaml create mode 100644 sky/serve/service_spec.py diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index f6f45463bff..abcbe66ff71 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2401,54 +2401,59 @@ def _refresh_cluster(cluster_name): return kept_records -def refresh_service_status(service: Optional[str]) -> List[Dict[str, Any]]: - if service is None: +def refresh_service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: + if service_name is None: service_records = global_user_state.get_services() else: - service_record = global_user_state.get_service_from_name(service) + service_record = global_user_state.get_service_from_name(service_name) if service_record is None: with 
ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service} does not exist.') + raise ValueError(f'Service {service_name} does not exist.') service_records = [service_record] # TODO(tian): Make it run in parallel. for record in service_records: - controller_cluster_name = record['controller_cluster_name'] endpoint = record['endpoint'] if not endpoint: continue # TODO(tian): Refactor: store ip and app_port separately. controller_ip = endpoint.split(':')[0] - with requests.Session() as session: + controller_url = f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}' + try: + resp = requests.get(controller_url + + '/control_plane/get_replica_nums', + timeout=5) + except requests.RequestException: + pass + else: + record.update(resp.json()) + if record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: + # TODO(tian): Current behaviour for user bugs in setup section + # is to teardown and relaunching forever. We should have a way + # to detect such bugs and stop relaunching. 
+ if record['num_failed_replicas'] > 0: + record['status'] = status_lib.ServiceStatus.FAILED + elif record['num_ready_replicas'] > 0: + record['status'] = status_lib.ServiceStatus.READY + elif record['num_unhealthy_replicas'] > 0: + record['status'] = status_lib.ServiceStatus.REPLICA_INIT + global_user_state.add_or_update_service(**record) + if service_name is not None: + assert record['name'] == service_name try: - resp = session.get( - f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}/control_plane/get_replica_nums', - timeout=5) + resp = requests.get(controller_url + + '/control_plane/get_replica_info', + timeout=5) except requests.RequestException: pass else: - record.update(resp.json()) - if record['num_healthy_replicas'] > 0: - record['status'] = status_lib.ServiceStatus.RUNNING - elif record['num_unhealthy_replicas'] > 0: - record['status'] = status_lib.ServiceStatus.REPLICA_INIT - global_user_state.add_or_update_service(**record) - if service is not None: - assert record['name'] == service - try: - resp = session.get( - f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}/control_plane/get_replica_info', - timeout=5) - except requests.RequestException: - pass - else: - record['replica_info'] = resp.json()['replica_info'] - decoded_info = [] - for info in record['replica_info']: - decoded_info.append({ - k: pickle.loads(base64.b64decode(v)) - for k, v in info.items() - }) - record['replica_info'] = decoded_info + record['replica_info'] = resp.json()['replica_info'] + decoded_info = [] + for info in record['replica_info']: + decoded_info.append({ + k: pickle.loads(base64.b64decode(v)) + for k, v in info.items() + }) + record['replica_info'] = decoded_info return service_records diff --git a/sky/cli.py b/sky/cli.py index 3cf9ddc87a0..37dd68f9c3b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3816,7 +3816,7 @@ def serve(): required=True, type=str, **_get_shell_complete_args(_complete_file_name)) -@click.option('--service', 
+@click.option('--service-name', '-s', default=None, type=str, @@ -3830,7 +3830,7 @@ def serve(): help='Skip confirmation prompt.') def serve_up( entrypoint: str, - service: Optional[str], + service_name: Optional[str], yes: bool, ): """Launches a SkyServe instance. @@ -3843,9 +3843,12 @@ def serve_up( sky serve up service.yaml """ - if service is None: - # TODO(tian): Check service name is unique. - service = backend_utils.generate_service_name() + if service_name is None: + service_name = backend_utils.generate_service_name() + + if global_user_state.get_service_from_name(service_name) is not None: + click.secho(f'Service {service_name!r} already exists.', fg='red') + return shell_splits = shlex.split(entrypoint) yaml_file_provided = (len(shell_splits) == 1 and @@ -3909,11 +3912,11 @@ def serve_up( return if not yes: - prompt = f'Launching a new service {service}. Proceed?' + prompt = f'Launching a new service {service_name}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_up(task, service, entrypoint) + sky.serve_up(task, service_name) @serve.command('status', cls=_DocumentedCodeCommand) @@ -3923,35 +3926,53 @@ def serve_up( is_flag=True, required=False, help='Show all information in full.') -@click.argument('service', +@click.argument('service_name', required=False, type=str, **_get_shell_complete_args(_complete_service_name)) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def serve_status(all: bool, service: Optional[str]): - service_records = core.service_status(service) +def serve_status(all: bool, service_name: Optional[str]): + """Show statuses of SkyServe services. + + Examples: + + .. 
code-block:: bash + + # Show status for all services + sky serve status + \b + # Show detailed status for all services + sky serve status -a + \b + # Show service status and replica status for a specific service + sky serve status my-service + \b + # Show detailed service status and replica status for a specific service + sky serve status my-service -a + """ + service_records = core.service_status(service_name) click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') status_utils.show_service_table(service_records, all) - if service is not None: + if service_name is not None: # If service not exist, we should already raise an error in # core.service_status. assert len(service_records) == 1, service_records service_record = service_records[0] if 'replica_info' not in service_record: - click.secho(f'Failed to refresh status of service: {service}.', + click.secho(f'Failed to refresh status of service: {service_name}.', fg='red') return - click.echo( - f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Replicas of {service}' - f'{colorama.Style.RESET_ALL}') + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Replicas of {service_name}{colorama.Style.RESET_ALL}') status_utils.show_replica_table(service_record['replica_info'], all) @serve.command('down', cls=_DocumentedCodeCommand) -@click.argument('service', +@click.argument('service_name', required=True, + type=str, **_get_shell_complete_args(_complete_service_name)) @click.option('--yes', '-y', @@ -3966,7 +3987,7 @@ def serve_status(all: bool, service: Optional[str]): required=False, help='Ignore errors (if any). ') def serve_down( - service: str, + service_name: str, yes: bool, purge: bool, ): @@ -3979,10 +4000,79 @@ def serve_down( sky serve down my-service """ if not yes: - prompt = f'Tearing down service {service}. Proceed?' + prompt = f'Tearing down service {service_name}. Proceed?' 
click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_down(service, purge) + sky.serve_down(service_name, purge) + + +@serve.command('logs', cls=_DocumentedCodeCommand) +@click.option( + '--follow/--no-follow', + is_flag=True, + default=True, + help=('Follow the logs of the job. [default: --follow] ' + 'If --no-follow is specified, print the log so far and exit.')) +@click.option('--control-plane', + '-c', + is_flag=True, + default=False, + required=False, + help='Show the control plane logs of this service.') +@click.option('--redirector', + '-r', + is_flag=True, + default=False, + required=False, + help='Show the redirector logs of this service.') +@click.option('--replica-id', + '-i', + default=None, + required=False, + help='Show the logs of a specific replica.') +@click.argument('service_name', + required=True, + type=str, + **_get_shell_complete_args(_complete_service_name)) +@usage_lib.entrypoint +def serve_logs( + service_name: str, + follow: bool, + control_plane: bool, + redirector: bool, + replica_id: Optional[str], +): + """Tail the log of a service. + + Example: + + .. code-block:: bash + + # Tail the control plane logs of a service + sky serve logs -c [SERVICE_ID] + \b + # Print the redirector logs so far and exit + sky serve logs -r --no-follow [SERVICE_ID] + """ + have_replica_id = replica_id is not None + if (control_plane + redirector + have_replica_id) != 1: + click.secho( + 'Only one of --control-plane, --redirector, --replica-id ' + 'can be specified. 
See `sky serve logs --help` for more ' + 'information.', + fg='red') + return + service_record = global_user_state.get_service_from_name(service_name) + if service_record is None: + click.secho(f'Service {service_name!r} not found.', fg='red') + return + controller_name = service_record['controller_cluster_name'] + if control_plane: + core.tail_logs(controller_name, job_id=1, follow=follow) + if redirector: + core.tail_logs(controller_name, job_id=2, follow=follow) + if have_replica_id: + raise NotImplementedError # ============================== diff --git a/sky/core.py b/sky/core.py index 71bca86b29c..b47cfb79df6 100644 --- a/sky/core.py +++ b/sky/core.py @@ -110,8 +110,8 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, @usage_lib.entrypoint -def service_status(service: Optional[str]) -> List[Dict[str, Any]]: - return backend_utils.refresh_service_status(service) +def service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: + return backend_utils.refresh_service_status(service_name) @usage_lib.entrypoint diff --git a/sky/execution.py b/sky/execution.py index e89d390852f..2ba9bfe3139 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -15,6 +15,7 @@ import copy import enum import getpass +import requests import tempfile import os import uuid @@ -951,8 +952,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): @usage_lib.entrypoint def serve_up( task: 'sky.Task', - name: str, - original_yaml_path: str, + service_name: str, ): """Serve up a service. 
@@ -964,30 +964,38 @@ def serve_up( Raises: """ - controller_cluster_name = serve.CONTROLLER_PREFIX + name + controller_cluster_name = serve.CONTROLLER_PREFIX + service_name assert task.service is not None, task policy = task.service.policy_str() assert len(task.resources) == 1 requested_resources = list(task.resources)[0] global_user_state.add_or_update_service( - name, controller_cluster_name, '', + service_name, controller_cluster_name, '', status_lib.ServiceStatus.CONTROLLER_INIT, 0, 0, 0, policy, requested_resources) app_port = int(task.service.app_port) assert len(task.resources) == 1, task - task.set_resources(list(task.resources)[0].copy(ports=[app_port])) + original_resources = list(task.resources)[0] + if original_resources.ports is not None and (len(original_resources.ports) + != 1): + if original_resources.ports[0] != app_port: + logger.warning('Ignoring port specification ' + f'{original_resources.ports} in resources.') + task.set_resources(original_resources.copy(ports=[app_port])) # TODO(tian): Use skyserve constants. + # TODO(tian): Clean up storage when the service is torn down. 
_maybe_translate_local_file_mounts_and_sync_up(task) - with tempfile.NamedTemporaryFile(prefix=f'serve-task-{name}-', + with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() if 'resources' in task_config and 'spot_recovery' in task_config[ 'resources']: del task_config['resources']['spot_recovery'] common_utils.dump_yaml(f.name, task_config) - remote_task_yaml_path = f'{serve.SERVICE_YAML_PREFIX}/service_{name}.yaml' + remote_task_yaml_path = (serve.SERVICE_YAML_PREFIX + + f'/service_{service_name}.yaml') vars_to_fill = { 'ports': [app_port, serve.CONTROL_PLANE_PORT], 'remote_task_yaml_path': remote_task_yaml_path, @@ -997,14 +1005,14 @@ def serve_up( 'disable_logging': env_options.Options.DISABLE_LOGGING.get(), } controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, - f'{name}.yaml') + f'{service_name}.yaml') backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) assert len(controller_task.resources) == 1, controller_task print(f'{colorama.Fore.YELLOW}' - f'Launching controller for {name}...' + f'Launching controller for {service_name}...' 
f'{colorama.Style.RESET_ALL}') _execute( @@ -1018,11 +1026,19 @@ def serve_up( controller_cluster_name) assert isinstance(handle, backends.CloudVmRayResourceHandle) endpoint = f'{handle.head_ip}:{task.service.app_port}' - global_user_state.add_or_update_service( - name, controller_cluster_name, endpoint, - status_lib.ServiceStatus.REPLICA_INIT, 0, 0, 0, policy, - requested_resources) + controller_envs = { + 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, + 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), + 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), + 'SKYPILOT_DISABLE_USAGE_COLLECTION': + env_options.Options.DISABLE_LOGGING.get(), + } + + # NOTICE: The job submission order cannot be changed since the + # `sky serve logs` CLI will identify the control plane job with + # the first job submitted and the redirector job with the second + # job submitted. print( f'{colorama.Fore.YELLOW}' 'Launching control plane process on controller...' @@ -1031,8 +1047,9 @@ def serve_up( _execute( entrypoint=sky.Task( name='run-control-plane', + envs=controller_envs, run='python -m sky.serve.control_plane --service-name ' - f'{name} --task-yaml {remote_task_yaml_path} ' + f'{service_name} --task-yaml {remote_task_yaml_path} ' f'--port {serve.CONTROL_PLANE_PORT}'), stream_logs=False, handle=handle, @@ -1046,13 +1063,14 @@ def serve_up( 'Launching redirector process on controller...' 
f'{colorama.Style.RESET_ALL}', end='') + control_plane_addr = f'http://0.0.0.0:{serve.CONTROL_PLANE_PORT}' _execute( entrypoint=sky.Task( name='run-redirector', + envs=controller_envs, run='python -m sky.serve.redirector --task-yaml ' f'{remote_task_yaml_path} --port {app_port} ' - f'--control-plane-addr http://0.0.0.0:{serve.CONTROL_PLANE_PORT}' - ), + f'--control-plane-addr {control_plane_addr}'), stream_logs=False, handle=handle, stages=[Stage.EXEC], @@ -1060,14 +1078,34 @@ def serve_up( detach_run=True, ) - print(f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}Serving at ' + global_user_state.add_or_update_service( + service_name, controller_cluster_name, endpoint, + status_lib.ServiceStatus.REPLICA_INIT, 0, 0, 0, policy, + requested_resources) + + print(f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}' + 'Gateway endpoint serving at ' f'{colorama.Style.RESET_ALL}{colorama.Fore.CYAN}' - f'{endpoint}.\n' + f'{endpoint}.' f'{colorama.Style.RESET_ALL}') + print(f'\n{colorama.Fore.CYAN}Service name: ' + f'{colorama.Style.BRIGHT}{service_name}{colorama.Style.RESET_ALL}' + '\nTo see detailed info about replicas:' + f'\t{backend_utils.BOLD}sky serve status {service_name} (-a)' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of controller:' + f'\t\t{backend_utils.BOLD}sky serve logs -c {service_name}' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of redirector:' + f'\t\t{backend_utils.BOLD}sky serve logs -r {service_name}' + f'{backend_utils.RESET_BOLD}' + '\nTo teardown the service:' + f'\t\t{backend_utils.BOLD}sky serve down {service_name}' + f'{backend_utils.RESET_BOLD}') def serve_down( - name: str, + service_name: str, purge: bool, ): """Teardown a service. 
@@ -1079,58 +1117,63 @@ def serve_down( Raises: """ - service_record = global_user_state.get_service_from_name(name) + service_record = global_user_state.get_service_from_name(service_name) if service_record is None: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {name} does not exist.') + raise ValueError(f'Service {service_name} does not exist.') controller_cluster_name = service_record['controller_cluster_name'] - num_healthy_replicas = service_record['num_healthy_replicas'] + num_ready_replicas = service_record['num_ready_replicas'] num_unhealthy_replicas = service_record['num_unhealthy_replicas'] - num_replicas = num_healthy_replicas + num_unhealthy_replicas + num_failed_replicas = service_record['num_failed_replicas'] + num_replicas = (num_ready_replicas + num_unhealthy_replicas + + num_failed_replicas) + controller_ip = service_record['endpoint'].split(':')[0] + controller_url = f'http://{controller_ip}:{serve.CONTROL_PLANE_PORT}' handle = global_user_state.get_handle_from_cluster_name( controller_cluster_name) - global_user_state.set_service_status(name, + global_user_state.set_service_status(service_name, status_lib.ServiceStatus.SHUTTING_DOWN) - try: - print( - f'{colorama.Fore.YELLOW}' - f'Stopping control plane and redirector processes on controller...' - f'{colorama.Style.RESET_ALL}') - core.cancel(controller_cluster_name, all=True) - except (ValueError, sky.exceptions.ClusterNotUpError) as e: - if purge: - logger.warning(f'Ignoring error when stopping controller: {e}') - else: - raise e - try: if handle is not None: plural = '' - # TODO(tian): Change to #num replica (including failed one) if num_replicas > 1: plural = 's' print(f'{colorama.Fore.YELLOW}' f'Tearing down {num_replicas} replica{plural}...' 
f'{colorama.Style.RESET_ALL}') - _execute( - entrypoint=sky.Task(name='teardown-all-replicas', - run='sky down -a -y'), - stream_logs=False, - handle=handle, - stages=[Stage.EXEC], - cluster_name=controller_cluster_name, - detach_run=False, - ) - except (RuntimeError, ValueError) as e: + resp = requests.post(controller_url + '/control_plane/terminate', + data='') + if resp.status_code != 200: + raise RuntimeError('Failed to terminate replica due to ' + f'request failure: {resp.text}') + msg = resp.json()['message'] + if msg: + raise RuntimeError( + 'Unexpected message when tearing down ' + f'replica: {msg}. Please login to the controller ' + 'and make sure the service is properly cleaned.') + except (RuntimeError, ValueError, requests.exceptions.ConnectionError) as e: if purge: logger.warning(f'Ignoring error when cleaning controller: {e}') else: raise e + try: + print( + f'{colorama.Fore.YELLOW}' + f'Stopping control plane and redirector processes on controller...' + f'{colorama.Style.RESET_ALL}') + core.cancel(controller_cluster_name, all=True) + except (ValueError, sky.exceptions.ClusterNotUpError) as e: + if purge: + logger.warning(f'Ignoring error when stopping controller: {e}') + else: + raise e + try: print(f'{colorama.Fore.YELLOW}' - 'Teardown controller...' + 'Tearing down controller...' f'{colorama.Style.RESET_ALL}') core.down(controller_cluster_name, purge=purge) except (RuntimeError, ValueError) as e: @@ -1139,8 +1182,13 @@ def serve_down( else: raise e - global_user_state.remove_service(name) + # TODO(tian): Maybe add a post_cleanup function? + controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, + f'{service_name}.yaml') + if os.path.exists(controller_yaml_path): + os.remove(controller_yaml_path) + global_user_state.remove_service(service_name) print(f'{colorama.Fore.GREEN}' - f'Tear down service {name} done.' + f'The tearing down of service {service_name} is done.' 
f'{colorama.Style.RESET_ALL}') diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 28c7717b812..c1b155437ef 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -100,7 +100,7 @@ def create_table(cursor, conn): controller_cluster_name TEXT, endpoint TEXT, status TEXT, - num_healthy_replicas INTEGER DEFAULT 0, + num_ready_replicas INTEGER DEFAULT 0, num_unhealthy_replicas INTEGER DEFAULT 0, num_failed_replicas INTEGER DEFAULT 0, policy TEXT, @@ -287,13 +287,13 @@ def add_or_update_cluster(cluster_name: str, def add_or_update_service( name: str, controller_cluster_name: str, endpoint: str, - status: status_lib.ServiceStatus, num_healthy_replicas: int, + status: status_lib.ServiceStatus, num_ready_replicas: int, num_unhealthy_replicas: int, num_failed_replicas, policy: str, requested_resources: Optional['resources_lib.Resources']): _DB.cursor.execute( 'INSERT or REPLACE INTO services' '(name, controller_cluster_name, endpoint, status, ' - 'num_healthy_replicas, num_unhealthy_replicas, ' + 'num_ready_replicas, num_unhealthy_replicas, ' 'num_failed_replicas, policy, requested_resources) ' 'VALUES (' # name @@ -304,7 +304,7 @@ def add_or_update_service( '?, ' # status '?, ' - # num_healthy_replicas + # num_ready_replicas '?, ' # num_unhealthy_replicas '?, ' @@ -324,8 +324,8 @@ def add_or_update_service( endpoint, # status status.value, - # num_healthy_replicas - num_healthy_replicas, + # num_ready_replicas + num_ready_replicas, # num_unhealthy_replicas num_unhealthy_replicas, # num_failed_replicas @@ -624,7 +624,7 @@ def get_service_from_name( # Explicitly specify the number of fields to unpack, so that # we can add new fields to the database in the future without # breaking the previous code. 
- (name, controller_cluster_name, endpoint, status, num_healthy_replicas, + (name, controller_cluster_name, endpoint, status, num_ready_replicas, num_unhealthy_replicas, num_failed_replicas, policy, requested_resources) = row[:9] # TODO: use namedtuple instead of dict @@ -633,7 +633,7 @@ def get_service_from_name( 'controller_cluster_name': controller_cluster_name, 'endpoint': endpoint, 'status': status_lib.ServiceStatus[status], - 'num_healthy_replicas': num_healthy_replicas, + 'num_ready_replicas': num_ready_replicas, 'num_unhealthy_replicas': num_unhealthy_replicas, 'num_failed_replicas': num_failed_replicas, 'policy': policy, @@ -673,7 +673,7 @@ def get_services() -> List[Dict[str, Any]]: rows = _DB.cursor.execute('select * from services').fetchall() records = [] for row in rows: - (name, controller_cluster_name, endpoint, status, num_healthy_replicas, + (name, controller_cluster_name, endpoint, status, num_ready_replicas, num_unhealthy_replicas, num_failed_replicas, policy, requested_resources) = row[:9] # TODO: use namedtuple instead of dict @@ -683,7 +683,7 @@ def get_services() -> List[Dict[str, Any]]: 'controller_cluster_name': controller_cluster_name, 'endpoint': endpoint, 'status': status_lib.ServiceStatus[status], - 'num_healthy_replicas': num_healthy_replicas, + 'num_ready_replicas': num_ready_replicas, 'num_unhealthy_replicas': num_unhealthy_replicas, 'num_failed_replicas': num_failed_replicas, 'policy': policy, diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index d17081db6d2..d2aa7a87b70 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,3 +1,5 @@ +"""Modules for SkyServe services.""" from sky.serve.constants import (CONTROLLER_PREFIX, CONTROLLER_TEMPLATE, CONTROLLER_YAML_PREFIX, SERVICE_YAML_PREFIX, CONTROL_PLANE_PORT) +from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index ecf7116888d..44277ab4dd7 100644 --- a/sky/serve/autoscalers.py +++ 
b/sky/serve/autoscalers.py @@ -1,152 +1,143 @@ +"""Autoscalers: perform autoscaling by monitoring metrics.""" import logging +import threading import time - from typing import Optional -from sky.serve.infra_providers import InfraProvider -from sky.serve.load_balancers import LoadBalancer +from sky.serve import infra_providers +from sky.serve import constants logger = logging.getLogger(__name__) class Autoscaler: + """Abstract class for autoscalers.""" def __init__(self, - infra_provider: InfraProvider, - load_balancer: LoadBalancer, - frequency: int = 60): + infra_provider: infra_providers.InfraProvider, + frequency: int, + min_nodes: int = 1, + max_nodes: Optional[int] = None) -> None: self.infra_provider = infra_provider - self.load_balancer = load_balancer + self.min_nodes: int = min_nodes + # Default to fixed node, i.e. min_nodes == max_nodes. + self.max_nodes: int = max_nodes or min_nodes self.frequency = frequency # Time to sleep in seconds. + if frequency < constants.CONTROL_PLANE_SYNC_INTERVAL: + logger.warning('Autoscaler frequency is less than ' + 'control plane sync interval. 
It might ' + 'not always got the latest information.') - def evaluate_scaling(self): + def evaluate_scaling(self) -> None: raise NotImplementedError - def scale_up(self, num_nodes_to_add: int): + def scale_up(self, num_nodes_to_add: int) -> None: logger.debug(f'Scaling up by {num_nodes_to_add} nodes') self.infra_provider.scale_up(num_nodes_to_add) - def scale_down(self, num_nodes_to_remove): + def scale_down(self, num_nodes_to_remove: int) -> None: logger.debug(f'Scaling down by {num_nodes_to_remove} nodes') self.infra_provider.scale_down(num_nodes_to_remove) - def monitor(self): + def monitor(self) -> None: logger.info('Starting autoscaler monitor.') - while True: - self.evaluate_scaling() + while not self.monitor_thread_stop_event.is_set(): + try: + self.evaluate_scaling() + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # monitor running. + logger.error(f'Error in autoscaler monitor: {e}') time.sleep(self.frequency) + def start_monitor(self) -> None: + self.monitor_thread_stop_event = threading.Event() + self.monitor_thread = threading.Thread(target=self.monitor) + self.monitor_thread.start() -class LatencyThresholdAutoscaler(Autoscaler): - - def __init__(self, - *args, - upper_threshold: int = 50, - lower_threshold: int = 1, - min_nodes: int = 1, - **kwargs): - """ - Autoscaler that scales up when the average latency of all servers is above the upper threshold and scales down - when the average latency of all servers is below the lower threshold. 
- :param args: - :param upper_threshold: upper threshold for latency in seconds - :param lower_threshold: lower threshold for latency in seconds - :param min_nodes: minimum number of nodes to keep running - :param kwargs: - """ - super().__init__(*args, **kwargs) - self.upper_threshold = upper_threshold - self.lower_threshold = lower_threshold - self.min_nodes = min_nodes - - def evaluate_scaling(self): - server_loads = self.load_balancer.server_loads - if not server_loads: - return - - avg_latencies = [ - sum(latencies) / len(latencies) - for latencies in server_loads.values() - ] - - if all(latency > self.upper_threshold for latency in avg_latencies): - self.scale_up(1) - elif all(latency < self.lower_threshold for latency in avg_latencies): - if self.infra_provider.total_servers() > self.min_nodes: - self.scale_down(1) + def terminate_monitor(self) -> None: + self.monitor_thread_stop_event.set() + self.monitor_thread.join() class RequestRateAutoscaler(Autoscaler): - - def __init__(self, - *args, - min_nodes: int = 1, - max_nodes: Optional[int] = None, - upper_threshold: Optional[float] = None, - lower_threshold: Optional[float] = None, - cooldown: int = 60, - **kwargs): - """ - Autoscaler that scales when the number of requests in the given interval is above or below the upper threshold - :param args: - :param query_interval: - :param upper_threshold: - :param lower_threshold: - :param min_nodes: - :param cooldown: Seconds to wait before scaling again. - :param kwargs: - """ + """ + Autoscaler that scales when the number of requests in the given + interval is above or below the upper threshold. + """ + + def __init__(self, *args, upper_threshold: Optional[float], + lower_threshold: Optional[float], cooldown: int, + query_interval: int, **kwargs) -> None: super().__init__(*args, **kwargs) - self.min_nodes = min_nodes - self.max_nodes = max_nodes or min_nodes - self.query_interval = 60 # Therefore thresholds represent queries per minute. 
- self.upper_threshold = upper_threshold - self.lower_threshold = lower_threshold - self.cooldown = cooldown - self.last_scale_operation = 0 # Time of last scale operation. - - def evaluate_scaling(self): + # Cooldown between two scaling operations in seconds. + self.cooldown: int = cooldown + # Quesy interval for requests num. Every `query_interval` seconds, + # Autoscaler will received an update for number of requests from + # redirector. + self.query_interval: int = query_interval + # Time of last scale operation + self.last_scale_operation: float = 0. + # Number of requests in the last `query_interval` seconds. + self.num_requests: int = 0 + # Upper threshold for scale up. If None, no scale up. + self.upper_threshold: Optional[float] = upper_threshold + # Lower threshold for scale down. If None, no scale down. + self.lower_threshold: Optional[float] = lower_threshold + + def set_num_requests(self, num_requests: int) -> None: + self.num_requests = num_requests + + def get_query_interval(self) -> int: + return self.query_interval + + def evaluate_scaling(self) -> None: current_time = time.time() - - # Check if cooldown period has passed since the last scaling operation - if current_time - self.last_scale_operation < self.cooldown: - logger.info(f'Current time: {current_time}, ' - f'last scale operation: {self.last_scale_operation}, ' - f'cooldown: {self.cooldown}') - logger.info( - 'Cooldown period has not passed since last scaling operation. Skipping scaling.' - ) - return - - while (self.load_balancer.request_timestamps and - current_time - self.load_balancer.request_timestamps[0] > - self.query_interval): - self.load_balancer.request_timestamps.popleft() - - num_requests = len(self.load_balancer.request_timestamps) - num_requests = float( - num_requests) / 60 # Convert to requests per second. - num_nodes = self.infra_provider.total_servers() - requests_per_node = num_requests / num_nodes if num_nodes else num_requests # To account for zero case. 
+ num_nodes = self.infra_provider.total_replica_num() + + # Check if cooldown period has passed since the last scaling operation. + # Only cooldown if bootstrapping is done. + if num_nodes >= self.min_nodes: + if current_time - self.last_scale_operation < self.cooldown: + logger.info( + f'Current time: {current_time}, ' + f'last scale operation: {self.last_scale_operation}, ' + f'cooldown: {self.cooldown}') + logger.info('Cooldown period has not passed since last scaling ' + 'operation. Skipping scaling.') + return + + # Convert to requests per second. + num_requests_per_second = float(self.num_requests) / self.query_interval + # Edge case: num_nodes is zero. + requests_per_node = (num_requests_per_second / num_nodes + if num_nodes else num_requests_per_second) logger.info(f'Requests per node: {requests_per_node}') - logger.info(f'Upper threshold: {self.upper_threshold} qps/node, ' - f'lower threshold: {self.lower_threshold} qps/node, ' - f'queries per node: {requests_per_node} qps/node') + # logger.info(f'Upper threshold: {self.upper_threshold} qps/node, ' + # f'lower threshold: {self.lower_threshold} qps/node, ' + # f'queries per node: {requests_per_node} qps/node') - scaled = True # Bootstrap case logger.info(f'Number of nodes: {num_nodes}') if num_nodes < self.min_nodes: - logger.info('Bootstrapping autoscaler.') + logger.info('Bootstrapping service.') self.scale_up(1) self.last_scale_operation = current_time - elif self.upper_threshold is not None and requests_per_node > self.upper_threshold: - if self.infra_provider.total_servers() < self.max_nodes: + elif (self.upper_threshold is not None and + requests_per_node > self.upper_threshold): + if num_nodes < self.max_nodes: + logger.info('Requests per node is above upper threshold ' + f'{self.upper_threshold}qps/node. 
' + 'Scaling up by 1 node.') self.scale_up(1) self.last_scale_operation = current_time - elif self.lower_threshold is not None and requests_per_node < self.lower_threshold: - if self.infra_provider.total_servers() > self.min_nodes: + elif (self.lower_threshold is not None and + requests_per_node < self.lower_threshold): + if num_nodes > self.min_nodes: + logger.info('Requests per node is below lower threshold ' + f'{self.lower_threshold}qps/node. ' + 'Scaling down by 1 node.') self.scale_down(1) self.last_scale_operation = current_time else: diff --git a/sky/serve/common.py b/sky/serve/common.py deleted file mode 100644 index f5e66fdf4e1..00000000000 --- a/sky/serve/common.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import Optional, Dict, Any - -from sky.backends import backend_utils -from sky.utils import schemas -from sky.utils import ux_utils - - -class SkyServiceSpec: - - def __init__( - self, - readiness_path: str, - readiness_timeout: int, - app_port: int, - min_replica: int, - max_replica: Optional[int] = None, - qps_upper_threshold: Optional[float] = None, - qps_lower_threshold: Optional[float] = None, - ): - if max_replica is not None and max_replica < min_replica: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'max_replica must be greater than or equal to min_replica') - # TODO: check if the path is valid - self._readiness_path = f':{app_port}{readiness_path}' - self._readiness_timeout = readiness_timeout - # TODO: check if the port is valid - self._app_port = str(app_port) - self._min_replica = min_replica - self._max_replica = max_replica - self._qps_upper_threshold = qps_upper_threshold - self._qps_lower_threshold = qps_lower_threshold - - @classmethod - def from_yaml_config(cls, config: Optional[Dict[str, Any]]): - if config is None: - return None - - backend_utils.validate_schema(config, schemas.get_service_schema(), - 'Invalid service YAML:') - - service_config = {} - service_config['readiness_path'] = 
config['readiness_probe']['path'] - service_config['readiness_timeout'] = config['readiness_probe'][ - 'readiness_timeout'] - service_config['app_port'] = config['port'] - service_config['min_replica'] = config['replica_policy']['min_replica'] - service_config['max_replica'] = config['replica_policy'].get( - 'max_replica', None) - service_config['qps_upper_threshold'] = config['replica_policy'].get( - 'qps_upper_threshold', None) - service_config['qps_lower_threshold'] = config['replica_policy'].get( - 'qps_lower_threshold', None) - - return SkyServiceSpec(**service_config) - - def to_yaml_config(self): - replica_policy = {} - - def add_if_not_none(key, value, no_empty: bool = False): - if no_empty and not value: - return - if value is not None: - replica_policy[key] = value - - add_if_not_none('min_replica', self.min_replica) - add_if_not_none('max_replica', self.max_replica) - add_if_not_none('qps_upper_threshold', self.qps_upper_threshold) - add_if_not_none('qps_lower_threshold', self.qps_lower_threshold) - - return { - 'port': int(self.app_port), - 'readiness_probe': { - 'path': self.readiness_path[len(f':{self.app_port}'):], - 'readiness_timeout': self.readiness_timeout, - }, - 'replica_policy': replica_policy, - } - - def policy_str(self): - if self.max_replica == self.min_replica or self.max_replica is None: - plural = '' - if self.min_replica > 1: - plural = 'S' - return f'FIXED NODE{plural}: {self.min_replica}' - # TODO(tian): Refactor to contain more information - return f'AUTOSCALE [{self.min_replica}, {self.max_replica}]' - - @property - def readiness_path(self): - return self._readiness_path - - @property - def readiness_timeout(self): - return self._readiness_timeout - - @property - def app_port(self): - return self._app_port - - @property - def min_replica(self): - return self._min_replica - - @property - def max_replica(self): - return self._max_replica - - @property - def qps_upper_threshold(self): - return self._qps_upper_threshold - - @property - 
def qps_lower_threshold(self): - return self._qps_lower_threshold diff --git a/sky/serve/constants.py b/sky/serve/constants.py index e8ac94662c3..19792331093 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -8,3 +8,4 @@ SERVICE_YAML_PREFIX = '~/.sky/service' CONTROL_PLANE_PORT = 31001 +CONTROL_PLANE_SYNC_INTERVAL = 20 diff --git a/sky/serve/control_plane.py b/sky/serve/control_plane.py index 8b710646a96..21fb5c2b4fe 100644 --- a/sky/serve/control_plane.py +++ b/sky/serve/control_plane.py @@ -1,21 +1,17 @@ -import logging +"""Control Plane: the central control plane of SkyServe. +Responsible for autoscaling and replica management. +""" import argparse - -from sky.serve.autoscalers import RequestRateAutoscaler, Autoscaler -from sky.serve.common import SkyServiceSpec -from sky.serve.infra_providers import InfraProvider, SkyPilotInfraProvider -from sky.serve.load_balancers import RoundRobinLoadBalancer, LoadBalancer - -import time -import threading -import yaml - +import fastapi +import logging from typing import Optional - -from fastapi import FastAPI, Request import uvicorn +from sky import serve +from sky.serve import autoscalers +from sky.serve import infra_providers + logging.basicConfig( level=logging.INFO, format='%(asctime)s | %(levelname)-6s | %(name)-40s || %(message)s', @@ -25,41 +21,45 @@ class ControlPlane: + """Control Plane: control everything about replica. + + This class is responsible for: + - Starting and terminating the replica monitor and autoscaler. + - Providing the HTTP Server API for SkyServe to communicate with. 
+ """ def __init__(self, port: int, - infra_provider: InfraProvider, - load_balancer: LoadBalancer, - autoscaler: Optional[Autoscaler] = None): + infra_provider: infra_providers.InfraProvider, + autoscaler: Optional[autoscalers.Autoscaler] = None) -> None: self.port = port self.infra_provider = infra_provider - self.load_balancer = load_balancer self.autoscaler = autoscaler - self.app = FastAPI() - - def server_fetcher(self): - while True: - logger.info('Running server fetcher.') - server_ips = self.infra_provider.get_server_ips() - self.load_balancer.probe_endpoints(server_ips) - time.sleep(10) + self.app = fastapi.FastAPI() # TODO(tian): Authentication!!! - def run(self): + def run(self) -> None: - @self.app.post('/control_plane/increment_request_count') - async def increment_request_count(request: Request): + @self.app.post('/control_plane/get_num_requests') + async def get_num_requests(request: fastapi.Request): # await request request_data = await request.json() # get request data - count = 0 if 'counts' not in request_data else request_data['counts'] + num_requests = request_data['num_requests'] logger.info(f'Received request: {request_data}') - self.load_balancer.increment_request_count(count=count) + if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): + self.autoscaler.set_num_requests(num_requests) return {'message': 'Success'} - @self.app.get('/control_plane/get_server_ips') - def get_server_ips(): - return {'server_ips': list(self.load_balancer.servers_queue)} + @self.app.get('/control_plane/get_autoscaler_query_interval') + def get_autoscaler_query_interval(): + if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): + return {'query_interval': self.autoscaler.get_query_interval()} + return {'query_interval': None} + + @self.app.get('/control_plane/get_ready_replicas') + def get_ready_replicas(): + return {'ready_replicas': self.infra_provider.get_ready_replicas()} @self.app.get('/control_plane/get_replica_info') def 
get_replica_info(): @@ -68,25 +68,33 @@ def get_replica_info(): @self.app.get('/control_plane/get_replica_nums') def get_replica_nums(): return { - 'num_healthy_replicas': len(self.load_balancer.available_servers - ), + 'num_ready_replicas': self.infra_provider.ready_replica_num(), 'num_unhealthy_replicas': - self.infra_provider.total_servers() - - len(self.load_balancer.available_servers), - # TODO(tian): Detect error replicas - 'num_failed_replicas': 0 + self.infra_provider.unhealthy_replica_num(), + 'num_failed_replicas': self.infra_provider.failed_replica_num() } - # Run server_monitor and autoscaler.monitor (if autoscaler is defined) in separate threads in the background. This should not block the main thread. - server_fetcher_thread = threading.Thread(target=self.server_fetcher, - daemon=True) - server_fetcher_thread.start() - if self.autoscaler: - autoscaler_monitor_thread = threading.Thread( - target=self.autoscaler.monitor, daemon=True) - autoscaler_monitor_thread.start() - - logger.info(f'Sky Server started on http://0.0.0.0:{self.port}') + @self.app.post('/control_plane/terminate') + def terminate(request: fastapi.Request): + del request + # request_data = request.json() + # TODO(tian): Authentication!!! + logger.info('Terminating service...') + self.infra_provider.terminate_replica_fetcher() + if self.autoscaler is not None: + self.autoscaler.terminate_monitor() + msg = self.infra_provider.terminate() + return {'message': msg} + + # Run replica_monitor and autoscaler.monitor (if autoscaler is defined) + # in separate threads in the background. + # This should not block the main thread. 
+ self.infra_provider.start_replica_fetcher() + if self.autoscaler is not None: + self.autoscaler.start_monitor() + + logger.info( + f'SkyServe Control Plane started on http://0.0.0.0:{self.port}') uvicorn.run(self.app, host='0.0.0.0', port=self.port) @@ -108,40 +116,25 @@ def get_replica_nums(): args = parser.parse_args() # ======= Infra Provider ========= - # infra_provider = DummyInfraProvider() - infra_provider = SkyPilotInfraProvider(args.task_yaml, args.service_name) - - # ======= Load Balancer ========= - with open(args.task_yaml, 'r') as f: - task = yaml.safe_load(f) - if 'service' not in task: - raise ValueError('Task YAML must have a "service" section') - service_config = task['service'] - service_spec = SkyServiceSpec.from_yaml_config(service_config) - # Select the load balancing policy: RoundRobinLoadBalancer or LeastLoadedLoadBalancer - load_balancer = RoundRobinLoadBalancer( - infra_provider=infra_provider, - endpoint_path=service_spec.readiness_path, - readiness_timeout=service_spec.readiness_timeout) - # load_balancer = LeastLoadedLoadBalancer(n=5) - # autoscaler = LatencyThresholdAutoscaler(load_balancer, - # upper_threshold=0.5, # 500ms - # lower_threshold=0.1) # 100ms + service_spec = serve.SkyServiceSpec.from_yaml(args.task_yaml) + _infra_provider = infra_providers.SkyPilotInfraProvider( + args.task_yaml, + args.service_name, + readiness_path=service_spec.readiness_path, + readiness_timeout=service_spec.readiness_timeout, + post_data=service_spec.post_data) # ======= Autoscaler ========= - # Create an autoscaler with the RequestRateAutoscaler policy. Thresholds are defined as requests per node in the defined interval. 
- autoscaler = RequestRateAutoscaler( - infra_provider, - load_balancer, - frequency=5, + _autoscaler = autoscalers.RequestRateAutoscaler( + _infra_provider, + frequency=20, min_nodes=service_spec.min_replica, max_nodes=service_spec.max_replica, upper_threshold=service_spec.qps_upper_threshold, lower_threshold=service_spec.qps_lower_threshold, - cooldown=60) + cooldown=60, + query_interval=60) # ======= ControlPlane ========= - # Create a control plane object and run it. - control_plane = ControlPlane(args.port, infra_provider, load_balancer, - autoscaler) + control_plane = ControlPlane(args.port, _infra_provider, _autoscaler) control_plane.run() diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml index d0fe866f259..33a29eaa439 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/sky/serve/examples/http_server/task.yaml @@ -3,7 +3,7 @@ resources: ports: - 8081 -workdir: . +workdir: sky/serve/examples/http_server run: python3 server.py diff --git a/sky/serve/examples/stable_diffusion_service.yaml b/sky/serve/examples/stable_diffusion_service.yaml new file mode 100644 index 00000000000..bed040b881a --- /dev/null +++ b/sky/serve/examples/stable_diffusion_service.yaml @@ -0,0 +1,34 @@ +#SkyPilot YAML to run stable diffusion web tool on 1 V100 GPU. 
+ +resources: + cloud: gcp + accelerators: V100:1 + +service: + port: 7860 + readiness_probe: + path: / + readiness_timeout: 1200 + replica_policy: + min_replica: 2 + +file_mounts: + /stable_diffusion: examples/stable_diffusion + +setup: | + sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + cd stable-diffusion-webui-docker + sudo rm -r stable-diffusion-webui-docker + git clone https://github.com/AbdBarho/stable-diffusion-webui-docker.git + cd stable-diffusion-webui-docker + git reset --hard 0d8b7d4ac8f9ba99e041ca332547eab9d65e6360 + wget https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt -P models + mv models/sd-v1-4.ckpt models/model.ckpt + docker pull berkeleyskypilot/stable-diffusion + rm docker-compose.yml + cp /stable_diffusion/docker-compose.yml . + +run: | + cd stable-diffusion-webui-docker + docker-compose up diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 0a4c931e678..f727584b881 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -1,222 +1,418 @@ +"""InfraProvider: handles the creation and deletion of endpoint replicas.""" +import base64 +import collections +from concurrent import futures import logging -from typing import List -import time +import multiprocessing +import os import pickle -import base64 +import requests +import signal +import threading +import time +from typing import List, Dict, Set, Optional, Any, Union import sky +from sky import backends +from sky import status_lib from sky.backends import backend_utils -import urllib -import threading - logger = logging.getLogger(__name__) +_PROCESS_POOL_REFRESH_INTERVAL = 20 +_ENDPOINT_PROBE_INTERVAL = 10 +# TODO(tian): Maybe let user determine this threshold +_CONTINUOUS_FAILURE_THRESHOLD = 180 // _ENDPOINT_PROBE_INTERVAL + class InfraProvider: + """Each infra 
provider manages one services.""" + + def __init__( + self, + readiness_path: str, + readiness_timeout: int, + post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: + self.ready_replicas: Set[str] = set() + self.unhealthy_replicas: Set[str] = set() + self.failed_replicas: Set[str] = set() + self.first_unhealthy_time: Dict[str, float] = dict() + self.continuous_failure: Dict[str, int] = collections.defaultdict(int) + self.readiness_path: str = readiness_path + self.readiness_timeout: int = readiness_timeout + self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data + logger.info(f'Readiness probe path: {self.readiness_path}') + logger.info(f'Post data: {self.post_data} ({type(self.post_data)})') + + def get_replica_info(self) -> List[Dict[str, str]]: + # Get replica info for all replicas + raise NotImplementedError + + def _get_replica_ips(self) -> Set[str]: + # Get all replica ips + raise NotImplementedError + + def total_replica_num(self) -> int: + # Returns the total number of replicas, including those under + # provisioning and deletion + raise NotImplementedError + + def ready_replica_num(self) -> int: + # Returns the total number of available replicas + raise NotImplementedError - def get_server_ips(self) -> List[str]: + def get_ready_replicas(self) -> Set[str]: + # Returns the endpoints of all ready replicas raise NotImplementedError - def total_servers(self) -> int: - # Returns the total number of servers, including those under provisioning and deletion + def unhealthy_replica_num(self) -> int: + # Returns the total number of unhealthy replicas + raise NotImplementedError + + def failed_replica_num(self) -> int: + # Returns the number of failed replicas raise NotImplementedError def scale_up(self, n: int) -> None: raise NotImplementedError def scale_down(self, n: int) -> None: - # TODO - Scale down must also pass in a list of servers to delete or the number of servers to delete + # TODO - Scale down must also pass in a list of replicas to 
+ # delete or the number of replicas to delete raise NotImplementedError - def terminate_servers(self, unhealthy_servers: List[str]): - # Terminates the servers with endpoints in the list + def _terminate_replicas(self, unhealthy_replicas: Set[str]) -> None: + # Terminates the replicas with endpoints in the list raise NotImplementedError + def terminate(self) -> Optional[str]: + # Terminate service + raise NotImplementedError -class DummyInfraProvider(InfraProvider): - - def __init__(self): - self.DEFAULT_ENDPOINTS = [ - 'https://httpbin.org/get?id=basecase', 'https://www.google.com', - 'http://thiswebsitedoesntexistitsonlyfortesting.com' - ] - self.current_endpoints = self.DEFAULT_ENDPOINTS.copy() - - def get_server_ips(self) -> List[str]: - logger.info('Returning current endpoints: ' + - str(self.current_endpoints)) - return self.current_endpoints - - def total_servers(self) -> int: - return len(self.current_endpoints) - - def scale_up(self, n) -> None: - logger.info('DummyInfraProvider.scale_up called with n=' + str(n) + - '. Sleeping for 30s.') - for i in range(30): - logger.info('DummyInfraProvider.scale_up: ' + str(i) + '/30') - time.sleep(1) - # Add n new endpoints - for i in range(n): - self.current_endpoints.append('https://httpbin.org/get?id=' + - str(len(self.current_endpoints))) - logger.info('DummyInfraProvider.scale_up: done sleeping.') - - def scale_down(self, n) -> None: - logger.info('DummyInfraProvider.scale_down called with n=' + str(n) + - '. 
Doing nothing.') - - def terminate_servers(self, unhealthy_servers: List[str]): - # Remove unhealthy servers from current_endpoints - logger.info( - 'DummyInfraProvider.terminate_servers called with unhealthy_servers=' - + str(unhealthy_servers)) - self.current_endpoints = [ - endpoint for endpoint in self.current_endpoints - if endpoint not in unhealthy_servers - ] + def start_replica_fetcher(self) -> None: + # Start the replica fetcher thread + raise NotImplementedError + def terminate_replica_fetcher(self) -> None: + # Terminate the replica fetcher thread + raise NotImplementedError -class SkyPilotInfraProvider(InfraProvider): + def probe_all_endpoints(self) -> None: + # Probe readiness of all endpoints + raise NotImplementedError - def __init__(self, task_yaml_path: str, cluster_name_prefix: str): - self.task_yaml_path = task_yaml_path - self.cluster_name_prefix = cluster_name_prefix + '-' - self.id_counter = self._get_id_start() - def _get_id_start(self): +class SkyPilotInfraProvider(InfraProvider): + """Infra provider for SkyPilot clusters.""" + + def __init__(self, task_yaml_path: str, cluster_name_prefix: str, *args, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.task_yaml_path: str = task_yaml_path + self.cluster_name_prefix: str = cluster_name_prefix + '-' + self.id_counter: int = 1 + self.launch_process_pool: Dict[str, multiprocessing.Process] = dict() + self.down_process_pool: Dict[str, multiprocessing.Process] = dict() + + self._start_refresh_process_pool() + + def _refresh_process_pool(self) -> None: + while not self.refresh_process_pool_stop_event.is_set(): + logger.info('Refreshing process pool.') + for op, pool in zip( + ['Launch', 'Down'], + [self.launch_process_pool, self.down_process_pool]): + for cluster_name, p in list(pool.items()): + if not p.is_alive(): + # TODO(tian): Try-catch in process, and have an enum + # return value to indicate which type of failure + # happened. 
Currently we only have user code failure + # since the retry_until_up flag is set to True, but it + # will be helpful when we enable user choose whether to + # retry or not. + logger.info( + f'{op} process for {cluster_name} finished.') + del pool[cluster_name] + if p.exitcode != 0: + logger.info( + f'{op} process for {cluster_name} exited ' + f'abnormally with code {p.exitcode}.') + self.failed_replicas.add(cluster_name) + time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) + + def _start_refresh_process_pool(self) -> None: + self.refresh_process_pool_stop_event = threading.Event() + self.refresh_process_pool_thread = threading.Thread( + target=self._refresh_process_pool) + self.refresh_process_pool_thread.start() + + def _terminate_refresh_process_pool(self) -> None: + self.refresh_process_pool_stop_event.set() + self.refresh_process_pool_thread.join() + + def _get_ip_clusname_map(self) -> Dict[str, str]: """ - Returns the id to start from when creating a new cluster + Returns a map of ip to cluster name for all clusters. 
""" clusters = sky.global_user_state.get_clusters() - # Filter out clusters that don't have the prefix - clusters = [ - cluster for cluster in clusters - if self.cluster_name_prefix in cluster['name'] - ] - # Get the greatest id - max_id = 0 - for cluster in clusters: - name = cluster['name'] - id = int(name.split('-')[-1]) - if id > max_id: - max_id = id - return max_id + 1 - - def _get_ip_clusname_map(self): - """Returns a map of ip to cluster name for all clusters with the prefix""" - clusters = sky.global_user_state.get_clusters() ip_clusname_map = {} + dummy_counter = 0 for cluster in clusters: name = cluster['name'] - if self.cluster_name_prefix in name: - handle = cluster['handle'] - try: - # Get the head node ip - ip = backend_utils.get_node_ips(handle.cluster_yaml, - handle.launched_nodes, - handle)[0] - ip_clusname_map[ip] = name - except sky.exceptions.FetchIPError: - logger.warning(f'Unable to get IP for cluster {name}.') - continue + handle = cluster['handle'] + try: + # Get the head node ip + ip = backend_utils.get_node_ips(handle.cluster_yaml, + handle.launched_nodes, + handle)[0] + ip_clusname_map[ip] = name + except sky.exceptions.FetchIPError: + logger.warning(f'Unable to get IP for cluster {name}.' + 'Use dummp IP instead.') + ip_clusname_map[f'10.0.0.{dummy_counter}'] = name + dummy_counter += 1 + continue return ip_clusname_map - def get_replica_info(self): + def get_replica_info(self) -> List[Dict[str, str]]: + + def _get_replica_status(cluster_status: status_lib.ClusterStatus, + ip: str) -> status_lib.ReplicaStatus: + if ip in self.ready_replicas: + return status_lib.ReplicaStatus.READY + if ip in self.failed_replicas: + return status_lib.ReplicaStatus.FAILED + if cluster_status == status_lib.ClusterStatus.UP: + return status_lib.ReplicaStatus.UNHEALTHY + return status_lib.ReplicaStatus.INIT + + # TODO(tian): Return failed replica info here if it is already + # be torn down. 
clusters = sky.global_user_state.get_clusters() infos = [] for cluster in clusters: - if self.cluster_name_prefix in cluster['name']: - info = { - 'name': cluster['name'], - 'handle': cluster['handle'], - 'status': cluster['status'], - } - info = { - k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in info.items() - } - infos.append(info) + handle = cluster['handle'] + assert isinstance(handle, backends.CloudVmRayResourceHandle) + ip = handle.head_ip + info = { + 'name': cluster['name'], + 'handle': handle, + 'status': _get_replica_status(cluster['status'], ip), + } + info = { + k: base64.b64encode(pickle.dumps(v)).decode('utf-8') + for k, v in info.items() + } + infos.append(info) return infos - def _get_server_ips(self): - return list(self._get_ip_clusname_map().keys()) + def _get_replica_ips(self) -> Set[str]: + ips = set(self._get_ip_clusname_map().keys()) + logger.info(f'Returning SkyPilot endpoints: {ips}') + return ips - def _return_total_servers(self): + def total_replica_num(self) -> int: clusters = sky.global_user_state.get_clusters() - # Filter out clusters that don't have the prefix - # FIXME - this is a hack to get around. should implement a better filtering mechanism - clusters = [ - cluster for cluster in clusters - if self.cluster_name_prefix in cluster['name'] - ] + # All replica launched in controller is a replica. 
return len(clusters) - def _scale_up(self, n): + def get_ready_replicas(self) -> Set[str]: + return self.ready_replicas + + def ready_replica_num(self) -> int: + return len(self.ready_replicas) + + def unhealthy_replica_num(self) -> int: + return len(self.unhealthy_replicas) + + def failed_replica_num(self) -> int: + return len(self.failed_replicas) + + def _launch_cluster(self, cluster_name: str, task: sky.Task) -> None: + p = multiprocessing.Process(target=sky.launch, + args=(task,), + kwargs={ + 'cluster_name': cluster_name, + 'detach_run': True, + 'retry_until_up': True + }) + self.launch_process_pool[cluster_name] = p + p.start() + + def _scale_up(self, n: int) -> None: # Launch n new clusters task = sky.Task.from_yaml(self.task_yaml_path) - for i in range(0, n): + for _ in range(0, n): cluster_name = f'{self.cluster_name_prefix}{self.id_counter}' logger.info(f'Creating SkyPilot cluster {cluster_name}') - sky.launch(task, - cluster_name=cluster_name, - detach_run=True, - retry_until_up=True) # TODO - make the launch parallel + self._launch_cluster(cluster_name, task) self.id_counter += 1 - def _scale_down(self, n): + def scale_up(self, n: int) -> None: + self._scale_up(n) + + def _teardown_cluster(self, cluster_name: str) -> None: + p = multiprocessing.Process(target=sky.down, + args=(cluster_name,), + kwargs={'purge': True}) + self.down_process_pool[cluster_name] = p + p.start() + + def _scale_down(self, n: int) -> None: # Delete n clusters # Currently deletes the first n clusters clusters = sky.global_user_state.get_clusters() - # Filter out clusters that don't have the prefix - clusters = [ - cluster for cluster in clusters - if self.cluster_name_prefix in cluster['name'] - ] num_clusters = len(clusters) if num_clusters > 0: if n > num_clusters: logger.warning( - f'Trying to delete {n} clusters, but only {num_clusters} clusters exist. Deleting all clusters.' - ) + f'Trying to delete {n} clusters, but only {num_clusters} ' + 'clusters exist. 
Deleting all clusters.') n = num_clusters for i in range(0, n): cluster = clusters[i] logger.info(f'Deleting SkyPilot cluster {cluster["name"]}') - sky.down(cluster['name'], purge=True) - - def get_server_ips(self) -> List[str]: - ips = self._get_server_ips() - logger.info(f'Returning SkyPilot endpoints: {ips}') - return ips - - def total_servers(self) -> int: - return self._return_total_servers() - - def scale_up(self, n: int) -> None: - self._scale_up(n) + self._teardown_cluster(cluster['name']) def scale_down(self, n: int) -> None: self._scale_down(n) - def terminate_servers(self, unhealthy_servers: List[str]): - # Remove unhealthy servers from current_endpoints - logger.info( - 'SkyPilotInfraProvider.terminate_servers called with unhealthy_servers=' - + str(unhealthy_servers)) - for endpoint_url in unhealthy_servers: + def _terminate_replicas(self, unhealthy_replicas: Set[str]) -> None: + # Remove unhealthy replicas from current_endpoints + logger.info('SkyPilotInfraProvider._terminate_replicas called with ' + f'unhealthy_replicas={unhealthy_replicas}') + for endpoint_url in unhealthy_replicas: ip_to_name_map = self._get_ip_clusname_map() if endpoint_url not in ip_to_name_map: logger.warning( - f'Unable to find cluster name for endpoint {endpoint_url}. Skipping.' - ) + f'Unable to find cluster name for endpoint {endpoint_url}. 
' + 'Skipping.') continue name = ip_to_name_map[endpoint_url] - if endpoint_url in unhealthy_servers: + if endpoint_url in unhealthy_replicas: logger.info(f'Deleting SkyPilot cluster {name}') - threading.Thread(target=sky.down, - args=(name,), - kwargs={ - 'purge': True - }).start() + self._teardown_cluster(name) + + def terminate(self) -> Optional[str]: + # For correctly show serve status + self.ready_replicas.clear() + self.unhealthy_replicas = self._get_replica_ips() + self._terminate_refresh_process_pool() + for name, p in self.launch_process_pool.items(): + # Use keyboard interrupt here since sky.launch has great + # handling for it + # Edge case: sky.launched finished after the + # process_pool_refresh_process terminates + if p.is_alive(): + assert p.pid is not None + os.kill(p.pid, signal.SIGINT) + p.join() + self._teardown_cluster(name) + logger.info(f'Interrupted launch process for cluster {name}' + 'and deleted the cluster.') + replica_ips = self._get_replica_ips() + self._terminate_replicas(replica_ips) + msg = [] + for name, p in self.down_process_pool.items(): + p.join() + logger.info(f'Down process for cluster {name} finished.') + if p.exitcode != 0: + msg.append(f'Down process for cluster {name} exited abnormally' + f' with code {p.exitcode}. Please login to the ' + 'controller and make sure the cluster is released.') + if not msg: + return None + return '\n'.join(msg) + + def _replica_fetcher(self) -> None: + while not self.replica_fetcher_stop_event.is_set(): + logger.info('Running replica fetcher.') + try: + self.probe_all_endpoints() + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # replica fetcher running. 
+ logger.error(f'Error in replica fetcher: {e}') + time.sleep(_ENDPOINT_PROBE_INTERVAL) + + def start_replica_fetcher(self) -> None: + self.replica_fetcher_stop_event = threading.Event() + self.replica_fetcher_thread = threading.Thread( + target=self._replica_fetcher) + self.replica_fetcher_thread.start() + + def terminate_replica_fetcher(self) -> None: + self.replica_fetcher_stop_event.set() + self.replica_fetcher_thread.join() + + def probe_all_endpoints(self) -> None: + replica_ips = self._get_replica_ips() - self.failed_replicas + + def probe_endpoint(replica_ip: str) -> Optional[str]: + try: + msg = '' + readiness_url = f'http://{replica_ip}{self.readiness_path}' + if self.post_data is not None: + msg += 'Post' + response = requests.post(readiness_url, + json=self.post_data, + timeout=3) + else: + msg += 'Get' + response = requests.get(readiness_url, timeout=3) + msg += (f' request to {replica_ip} returned status code ' + f'{response.status_code}') + if response.status_code == 200: + msg += '.' + else: + msg += f' and response {response.text}.' 
+ logger.info(msg) + if response.status_code == 200: + logger.info(f'Replica {replica_ip} is available.') + return replica_ip + except requests.exceptions.RequestException as e: + logger.info(e) + logger.info(f'Replica {replica_ip} is not available.') + pass + return None + + with futures.ThreadPoolExecutor() as executor: + probe_futures = [ + executor.submit(probe_endpoint, replica_ip) + for replica_ip in replica_ips + ] + ready_replicas = set() + for future in futures.as_completed(probe_futures): + ip = future.result() + if ip is not None: + ready_replicas.add(ip) + + logger.info(f'Ready replicas: {ready_replicas}') + self.ready_replicas = ready_replicas + unhealthy_replicas = replica_ips - ready_replicas + logger.info(f'Unhealthy replicas: {unhealthy_replicas}') + self.unhealthy_replicas = unhealthy_replicas + + for replica in ready_replicas: + self.continuous_failure[replica] = 0 + + replicas_to_terminate = set() + for replica in unhealthy_replicas: + if replica not in self.first_unhealthy_time: + self.first_unhealthy_time[replica] = time.time() + self.continuous_failure[replica] += 1 + # coldstart time limitation is `self.readiness_timeout`. + first_unhealthy_time = self.first_unhealthy_time[replica] + if time.time() - first_unhealthy_time > self.readiness_timeout: + continuous_failure = self.continuous_failure[replica] + if continuous_failure > _CONTINUOUS_FAILURE_THRESHOLD: + logger.info(f'Terminating replica {replica}.') + replicas_to_terminate.add(replica) + else: + logger.info(f'Replica {replica} is unhealthy but ' + 'within unhealthy threshold. Skipping.') + else: + logger.info(f'Replica {replica} is unhealthy but within ' + 'readiness timeout. 
Skipping.') + + self._terminate_replicas(replicas_to_terminate) diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancers.py index f1d1b464a76..c04a3986d0b 100644 --- a/sky/serve/load_balancers.py +++ b/sky/serve/load_balancers.py @@ -1,164 +1,69 @@ -import time +"""LoadBalancer: select endpoint by load balancing algorithm.""" from collections import deque - -import aiohttp +import fastapi +import time import logging - -from concurrent.futures import ThreadPoolExecutor, as_completed -import requests +from typing import Optional, Deque, Set logger = logging.getLogger(__name__) +_DEFAULT_QUERY_INTERVAL = 60 + class LoadBalancer: + """Abstract class for load balancers.""" - def __init__(self, - infra_provider, - endpoint_path, - readiness_timeout, - post_data=None): - self.available_servers = [] - self.request_count = 0 - self.request_timestamps = deque() - self.infra_provider = infra_provider - self.endpoint_path = endpoint_path - self.readiness_timeout = readiness_timeout - self.post_data = post_data + def __init__(self) -> None: + self.ready_replicas: Set[str] = set() + self.request_count: int = 0 + self.request_timestamps: Deque[float] = deque() + self.query_interval: Optional[float] = None - def increment_request_count(self, count=1): + def increment_request_count(self, count: int = 1) -> None: self.request_count += count self.request_timestamps.append(time.time()) - def probe_endpoints(self, endpoint_ips): + def set_query_interval(self, query_interval: Optional[float]) -> None: + if query_interval is not None: + self.query_interval = query_interval + else: + self.query_interval = _DEFAULT_QUERY_INTERVAL + + def deprecate_old_requests(self) -> int: + if self.query_interval is None: + logger.error('Query interval is not set. ' + 'Use default interval instead.') + self.set_query_interval(None) + assert self.query_interval is not None + # TODO(tian): Optimize by binary search. 
+ while (self.request_timestamps and + time.time() - self.request_timestamps[0] > self.query_interval): + self.request_timestamps.popleft() + return len(self.request_timestamps) + + def set_ready_replicas(self, ready_replicas: Set[str]) -> None: raise NotImplementedError - def select_server(self, request): + def select_replica(self, request: fastapi.Request) -> Optional[str]: raise NotImplementedError class RoundRobinLoadBalancer(LoadBalancer): + """Round-robin load balancer.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.servers_queue = deque() - self.first_unhealthy_time = {} - logger.info(f'Endpoint path: {self.endpoint_path}') - - def probe_endpoints(self, endpoint_ips): - - def probe_endpoint(endpoint_ip): - try: - if self.post_data: - response = requests.post( - f'http://{endpoint_ip}{self.endpoint_path}', - json=self.post_data, - timeout=3) - else: - response = requests.get( - f'http://{endpoint_ip}{self.endpoint_path}', timeout=3) - if response.status_code == 200: - logger.info(f'Server {endpoint_ip} is available.') - return endpoint_ip - except requests.exceptions.RequestException as e: - logger.info(e) - logger.info(f'Server {endpoint_ip} is not available.') - pass - return None - - with ThreadPoolExecutor() as executor: - futures = [ - executor.submit(probe_endpoint, endpoint_url) - for endpoint_url in endpoint_ips - ] - healthy_servers = [ - future.result() - for future in as_completed(futures) - if future.result() is not None - ] - logger.info(f'Healthy servers: {healthy_servers}') - # Add newly available servers - for server in healthy_servers: - if server not in self.available_servers: - logger.info( - f'Server {server} is newly available. Adding to available servers.' 
- ) - self.available_servers.append(server) - self.servers_queue.append(server) - # Remove servers that are no longer available - unhealthy_servers = set() - for server in self.available_servers: - if server not in healthy_servers: - logger.info( - f'Server {server} is no longer available. Removing from available servers.' - ) - self.available_servers.remove(server) - self.servers_queue.remove(server) - unhealthy_servers.add(server) - # Tell the infra provider to remove endpoints that are no longer available - for server in endpoint_ips: - if server not in healthy_servers: - unhealthy_servers.add(server) - logger.info(f'Unhealthy servers: {unhealthy_servers}') - if unhealthy_servers: - servers_to_terminate = [] - for server in unhealthy_servers: - if server not in self.first_unhealthy_time: - self.first_unhealthy_time[server] = time.time() - elif time.time() - self.first_unhealthy_time[ - server] > self.readiness_timeout: # cooldown before terminating a dead server to avoid hysterisis - servers_to_terminate.append(server) - self.infra_provider.terminate_servers(servers_to_terminate) - - def select_server(self, request): - if not self.servers_queue: - return None - - server_ip = self.servers_queue.popleft() - self.servers_queue.append(server_ip) - logger.info(f'Selected server {server_ip} for request {request}') - return server_ip - - -class LeastLoadedLoadBalancer(LoadBalancer): - - def __init__(self, *args, n=10, **kwargs): - + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.server_loads = {} - self.n = n - - def probe_endpoints(self, endpoint_ips): - timeout = aiohttp.ClientTimeout(total=2) - with aiohttp.ClientSession(timeout=timeout) as session: - for server_ip in endpoint_ips: - try: - start_time = time() - with session.get(f'{server_ip}') as response: - if response.status == 200: - load = time() - start_time + self.replicas_queue: Deque[str] = deque() - if server_ip not in self.server_loads: - self.server_loads[server_ip] 
= [load] * self.n - else: - self.server_loads[server_ip].append(load) - if len(self.server_loads[server_ip]) > self.n: - self.server_loads[server_ip].pop(0) + def set_ready_replicas(self, ready_replicas: Set[str]) -> None: + if set(ready_replicas) != set(self.ready_replicas): + self.ready_replicas = ready_replicas + self.replicas_queue = deque(ready_replicas) - if server_ip not in self.available_servers: - self.available_servers.append(server_ip) - elif server_ip in self.available_servers: - self.available_servers.remove(server_ip) - del self.server_loads[server_ip] - except: - if server_ip in self.available_servers: - self.available_servers.remove(server_ip) - del self.server_loads[server_ip] - - def select_server(self, request): - if not self.server_loads: + def select_replica(self, request: fastapi.Request) -> Optional[str]: + if not self.replicas_queue: return None - - server_ip = min( - self.server_loads, - key=lambda x: sum(self.server_loads[x]) / len(self.server_loads[x])) - return server_ip + replica_ip = self.replicas_queue.popleft() + self.replicas_queue.append(replica_ip) + logger.info(f'Selected replica {replica_ip} for request {request}') + return replica_ip diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py index 95b364f184a..e1507cef8e7 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/redirector.py @@ -1,18 +1,14 @@ -import time +"""Redirector: redirect any incoming request to an endpoint replica.""" +import argparse +import fastapi import logging -import yaml -from collections import deque -from typing import List, Deque - -from sky.serve.common import SkyServiceSpec - -from fastapi import FastAPI, Request, HTTPException -from fastapi.responses import RedirectResponse import threading +import time import uvicorn - import requests -import argparse + +from sky.serve import constants +from sky.serve import load_balancers logging.basicConfig( level=logging.INFO, @@ -24,75 +20,81 @@ class SkyServeRedirector: + """Redirector: redirect 
incoming traffic. + + This class accept any traffic to the controller and redirect it + to the appropriate endpoint replica. + """ - def __init__(self, - control_plane_url: str, - service_spec: SkyServiceSpec, - port: int = 8081): + def __init__(self, control_plane_url: str, port: int, + load_balancer: load_balancers.LoadBalancer): + self.app = fastapi.FastAPI() self.control_plane_url = control_plane_url self.port = port - self.app_port = service_spec.app_port - self.server_ips: List[str] = [] - self.servers_queue: Deque[str] = deque() - self.app = FastAPI() - self.request_count = 0 - self.control_plane_sync_timeout = 20 - - def sync_with_control_plane(self): + self.load_balancer = load_balancer + + for i in range(3): + resp = requests.get(self.control_plane_url + + '/control_plane/get_autoscaler_query_interval') + if resp.status_code == 200: + self.load_balancer.set_query_interval( + resp.json()['query_interval']) + break + if i == 2: + logger.error('Failed to get autoscaler query interval. 
' + 'Use default interval instead.') + self.load_balancer.set_query_interval(None) + time.sleep(10) + + def _sync_with_control_plane(self): while True: - server_ips = [] with requests.Session() as session: try: - # send request count + # send request num in last query interval response = session.post( self.control_plane_url + - '/control_plane/increment_request_count', - json={'counts': self.request_count}, + '/control_plane/get_num_requests', + json={ + 'num_requests': + self.load_balancer.deprecate_old_requests() + }, timeout=5) response.raise_for_status() - self.request_count = 0 - # get server ips + # get replica ips response = session.get(self.control_plane_url + - '/control_plane/get_server_ips') + '/control_plane/get_ready_replicas') response.raise_for_status() - server_ips = response.json()['server_ips'] + ready_replicas = response.json()['ready_replicas'] except requests.RequestException as e: print(f'An error occurred: {e}') else: - logger.info(f'Server IPs: {server_ips}') - self.servers_queue = deque(server_ips) - time.sleep(self.control_plane_sync_timeout) - - def select_server(self): - if not self.servers_queue: - return None - server_ip = self.servers_queue.popleft() - self.servers_queue.append(server_ip) - return server_ip - - async def redirector_handler(self, request: Request): - self.request_count += 1 - server_ip = self.select_server() - - if server_ip is None: - raise HTTPException(status_code=503, detail='No available servers') - logger.info(f'Redirecting request to {server_ip}{request.url.path}') - - path = f'http://{server_ip}:{self.app_port}{request.url.path}' + logger.info(f'Available Replica IPs: {ready_replicas}') + self.load_balancer.set_ready_replicas(ready_replicas) + time.sleep(constants.CONTROL_PLANE_SYNC_INTERVAL) + + async def _redirector_handler(self, request: fastapi.Request): + self.load_balancer.increment_request_count(1) + replica_ip = self.load_balancer.select_replica(request) + + if replica_ip is None: + raise 
fastapi.HTTPException(status_code=503, + detail='No available replicas') + + path = f'http://{replica_ip}:{self.port}{request.url.path}' logger.info(f'Redirecting request to {path}') - return RedirectResponse(url=path) + return fastapi.responses.RedirectResponse(url=path) - def serve(self): + def run(self): self.app.add_api_route('/{path:path}', - self.redirector_handler, + self._redirector_handler, methods=['GET', 'POST', 'PUT', 'DELETE']) - server_fetcher_thread = threading.Thread( - target=self.sync_with_control_plane, daemon=True) - server_fetcher_thread.start() + sync_control_plane_thread = threading.Thread( + target=self._sync_with_control_plane, daemon=True) + sync_control_plane_thread.start() - logger.info(f'Sky Server started on http://0.0.0.0:{self.port}') - logger.info('Sky Serve Redirector is ready to serve.') + logger.info( + f'SkyServe Redirector started on http://0.0.0.0:{self.port}') uvicorn.run(self.app, host='0.0.0.0', port=self.port) @@ -107,7 +109,7 @@ def serve(self): parser.add_argument('--port', '-p', type=int, - help='Port to run the redirector on', + help='Port to run the redirector on.', required=True) parser.add_argument('--control-plane-addr', type=str, @@ -115,14 +117,11 @@ def serve(self): required=True) args = parser.parse_args() - with open(args.task_yaml, 'r') as f: - task = yaml.safe_load(f) - if 'service' not in task: - raise ValueError('Task YAML must have a "service" section') - service_config = task['service'] - service_spec = SkyServiceSpec.from_yaml_config(service_config) + # ======= Load Balancer ========= + _load_balancer = load_balancers.RoundRobinLoadBalancer() + # ======= Redirector ========= redirector = SkyServeRedirector(control_plane_url=args.control_plane_addr, - service_spec=service_spec, - port=args.port) - redirector.serve() + port=args.port, + load_balancer=_load_balancer) + redirector.run() diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py new file mode 100644 index 00000000000..133e038f366 --- 
/dev/null +++ b/sky/serve/service_spec.py @@ -0,0 +1,160 @@ +"""Service specification for SkyServe.""" +import os +import yaml +from typing import Optional, Dict, Any + +from sky.backends import backend_utils +from sky.serve import constants +from sky.utils import schemas +from sky.utils import ux_utils + + +class SkyServiceSpec: + """SkyServe service specification.""" + + def __init__( + self, + readiness_path: str, + readiness_timeout: int, + app_port: int, + min_replica: int, + max_replica: Optional[int] = None, + qps_upper_threshold: Optional[float] = None, + qps_lower_threshold: Optional[float] = None, + post_data: Optional[Dict[str, Any]] = None, + ): + if max_replica is not None and max_replica < min_replica: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'max_replica must be greater than or equal to min_replica') + if app_port == constants.CONTROL_PLANE_PORT: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'App port cannot be {constants.CONTROL_PLANE_PORT} ' + 'since it is reserved for the control plane. 
' + ' Please use a different port.') + # TODO: check if the path is valid + self._readiness_path = f':{app_port}{readiness_path}' + self._readiness_timeout = readiness_timeout + # TODO: check if the port is valid + self._app_port = str(app_port) + self._min_replica = min_replica + self._max_replica = max_replica + self._qps_upper_threshold = qps_upper_threshold + self._qps_lower_threshold = qps_lower_threshold + self._post_data = post_data + + @staticmethod + def from_yaml_config(config: Optional[Dict[str, Any]]): + if config is None: + return None + + backend_utils.validate_schema(config, schemas.get_service_schema(), + 'Invalid service YAML:') + + service_config = {} + service_config['readiness_path'] = config['readiness_probe']['path'] + service_config['readiness_timeout'] = config['readiness_probe'][ + 'readiness_timeout'] + service_config['app_port'] = config['port'] + service_config['min_replica'] = config['replica_policy']['min_replica'] + service_config['max_replica'] = config['replica_policy'].get( + 'max_replica', None) + service_config['qps_upper_threshold'] = config['replica_policy'].get( + 'qps_upper_threshold', None) + service_config['qps_lower_threshold'] = config['replica_policy'].get( + 'qps_lower_threshold', None) + service_config['post_data'] = config['readiness_probe'].get( + 'post_data', None) + + return SkyServiceSpec(**service_config) + + @staticmethod + def from_yaml(yaml_path: str): + with open(os.path.expanduser(yaml_path), 'r') as f: + config = yaml.safe_load(f) + + if isinstance(config, str): + with ux_utils.print_exception_no_traceback(): + raise ValueError('YAML loaded as str, not as dict. ' + f'Is it correct? Path: {yaml_path}') + + if config is None: + config = {} + + if 'service' not in config: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Service YAML must have a "service" section. ' + f'Is it correct? 
Path: {yaml_path}') + + return SkyServiceSpec.from_yaml_config(config['service']) + + def to_yaml_config(self): + config = dict() + + def add_if_not_none(section, key, value, no_empty: bool = False): + if no_empty and not value: + return + if value is not None: + if key is None: + config[section] = value + else: + if section not in config: + config[section] = dict() + config[section][key] = value + + add_if_not_none('port', None, int(self.app_port)) + add_if_not_none('readiness_probe', 'path', + self.readiness_path[len(f':{self.app_port}'):]) + add_if_not_none('readiness_probe', 'readiness_timeout', + self.readiness_timeout) + add_if_not_none('readiness_probe', 'post_data', self.post_data) + add_if_not_none('replica_policy', 'min_replica', self.min_replica) + add_if_not_none('replica_policy', 'max_replica', self.max_replica) + add_if_not_none('replica_policy', 'qps_upper_threshold', + self.qps_upper_threshold) + add_if_not_none('replica_policy', 'qps_lower_threshold', + self.qps_lower_threshold) + + return config + + def policy_str(self): + if self.max_replica == self.min_replica or self.max_replica is None: + plural = '' + if self.min_replica > 1: + plural = 'S' + return f'#REPLICA{plural}: {self.min_replica}' + # TODO(tian): Refactor to contain more information + return f'AUTOSCALE [{self.min_replica}, {self.max_replica}]' + + @property + def readiness_path(self) -> str: + return self._readiness_path + + @property + def readiness_timeout(self) -> int: + return self._readiness_timeout + + @property + def app_port(self) -> str: + return self._app_port + + @property + def min_replica(self) -> int: + return self._min_replica + + @property + def max_replica(self) -> Optional[int]: + return self._max_replica + + @property + def qps_upper_threshold(self) -> Optional[float]: + return self._qps_upper_threshold + + @property + def qps_lower_threshold(self) -> Optional[float]: + return self._qps_lower_threshold + + @property + def post_data(self) -> Optional[Dict[str, 
Any]]: + return self._post_data diff --git a/sky/status_lib.py b/sky/status_lib.py index ff977b009e2..5ce63feda69 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -61,9 +61,9 @@ class ServiceStatus(enum.Enum): REPLICA_INIT = 'REPLICA_INIT' # At least one replica is ready - RUNNING = 'RUNNING' + READY = 'READY' - # Service is being stopped + # Service is being shutting down SHUTTING_DOWN = 'SHUTTING_DOWN' # At least one replica is failed @@ -77,9 +77,35 @@ def colored_str(self): _SERVICE_STATUS_TO_COLOR = { ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE, ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, - ServiceStatus.RUNNING: colorama.Fore.GREEN, + ServiceStatus.READY: colorama.Fore.GREEN, ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, } -# TODO(tian): Add status for replicas to distinguish 'skypilot UP' and 'health probe succeeded' + +class ReplicaStatus(enum.Enum): + """Replica status.""" + + # Replica is initializing + INIT = 'INIT' + + # Replica is running + READY = 'READY' + + # Replica is unhealthy (e.g., health probe failed) + UNHEALTHY = 'UNHEALTHY' + + # Replica is failed + FAILED = 'FAILED' + + def colored_str(self): + color = _REPLICA_STATUS_TO_COLOR[self] + return f'{color}{self.value}{colorama.Style.RESET_ALL}' + + +_REPLICA_STATUS_TO_COLOR = { + ReplicaStatus.INIT: colorama.Fore.BLUE, + ReplicaStatus.READY: colorama.Fore.GREEN, + ReplicaStatus.UNHEALTHY: colorama.Fore.YELLOW, + ReplicaStatus.FAILED: colorama.Fore.RED, +} diff --git a/sky/task.py b/sky/task.py index 150113da5f1..43c6c0657bf 100644 --- a/sky/task.py +++ b/sky/task.py @@ -12,10 +12,10 @@ from sky import clouds from sky import exceptions from sky import global_user_state +from sky import serve as serve_lib from sky.backends import backend_utils from sky.data import storage as storage_lib from sky.data import data_utils -from sky.serve import common from sky.skylet import constants from sky.utils import schemas from sky.utils import 
ux_utils @@ -195,7 +195,7 @@ def __init__( self.estimated_outputs_size_gigabytes = None # Default to CPUNode self.resources = {sky.Resources()} - self._service = None + self._service: Optional[serve_lib.SkyServiceSpec] = None self.time_estimator_func: Optional[Callable[['sky.Resources'], int]] = None self.file_mounts: Optional[Dict[str, str]] = None @@ -370,7 +370,7 @@ def from_yaml_config( task.set_resources({resources}) service = config.pop('service', None) - service = common.SkyServiceSpec.from_yaml_config(service) + service = serve_lib.SkyServiceSpec.from_yaml_config(service) task.set_service(service) assert not config, f'Invalid task args: {config.keys()}' @@ -533,10 +533,11 @@ def get_resources(self): return self.resources @property - def service(self) -> Optional[common.SkyServiceSpec]: + def service(self) -> Optional[serve_lib.SkyServiceSpec]: return self._service - def set_service(self, service: Optional[common.SkyServiceSpec]) -> 'Task': + def set_service(self, + service: Optional[serve_lib.SkyServiceSpec]) -> 'Task': """Sets the service spec for this task. 
Args: diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 331494b672e..541425985e4 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -117,10 +117,9 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): _get_controller_cluster_name, show_by_default=False), StatusColumn('ENDPOINT', _get_endpoint), - StatusColumn('#HEALTHY_REPLICAS', _get_healthy_replicas), + StatusColumn('#READY_REPLICAS', _get_ready_replicas), StatusColumn('#UNHEALTHY_REPLICAS', _get_unhealthy_replicas), - # TODO(tian): After we have a better way to detect failed replicas - # StatusColumn('#FAILED_REPLICAS', _get_failed_replicas), + StatusColumn('#FAILED_REPLICAS', _get_failed_replicas), StatusColumn('STATUS', _get_service_status_colored), StatusColumn('POLICY', _get_policy, show_by_default=False), StatusColumn('REQUESTED_RESOURCES', @@ -375,8 +374,8 @@ def show_local_status_table(local_clusters: List[str]): _get_controller_cluster_name = ( lambda service_record: service_record['controller_cluster_name']) _get_endpoint = (lambda service_record: service_record['endpoint']) -_get_healthy_replicas = ( - lambda service_record: service_record['num_healthy_replicas']) +_get_ready_replicas = ( + lambda service_record: service_record['num_ready_replicas']) _get_unhealthy_replicas = ( lambda service_record: service_record['num_unhealthy_replicas']) _get_failed_replicas = ( diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index c44fd07f941..6668907a4f0 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -158,6 +158,13 @@ def get_service_schema(): 'readiness_timeout': { 'type': 'number', }, + 'post_data': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'object', + }] + } } }, 'replica_policy': { From 05da33fdb74895c8c176a63b1fd354730721b626 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sat, 12 Aug 2023 23:12:10 -0700 Subject: [PATCH 009/223] [SkyServe] Use SSH for 
Authentication, new replica status, `sky serve logs` for replica info (#2353) * introducing multiprocessing prototype * add run env to controller & redirector * reefactor and format * add control-plane and redirector logs * minor * minor * Refactor: move to infra provider * Refactor: move load balancer to redirector * refactor, add more logging * add replica status * resolve some TODOs * add post data feature * rename, format * add error message handling * bug fix & logging * fix a bug in continuous unhealthy * add error when user port is same with control plane * fix None post_data bug * add stable diffusion example * remove response body when code == 200 * add some TODOs and change RUNNING to READY * add failed status * add TODO for return failed replica info * fix sky serve status --help error * add console help messages * remove redundant stable diffusion setup files * rename healthy_replica --> ready_replica * finish replica info & num * finish * adopt advice from code review * rename to service_name * finish state machine; TODO property based implementation * adopt advice from comment * adopt comments in #2311 * finish new replica status * modify http example more resonable * UX details & set default controller resources to VCPU=4 * add spinner for launching contorl plane & redirector process * add sky serve logs CLI for replicas * add uptime section for service table * relaunch replicas which terminated by exceeding consecutive failure threshold * UX details * code style * move serve dependency to controller yaml setup section * add launch log for replica * add resources preview * stop jupyter service to avoid port conflict * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * fix userjob failed and launch failed not terminate replica; replica status FAILED --> CLEANUP_FAILED since we terminate all FAILED replica immediately now; remove --purge in termination * ux nits * 0.0.0.0 -> localhost * new log logic: use cluster status == UP instead 
of waiting 10s; early quit for replica not exist; skip all detailed file sync log * ux nits * change readiness timeout to initial delay seconds * disable some logging when SKYPILOT_DEBUG is not set * restore debug yaml * remove debug message * sync down log before teardown * rename failed status name (replica) * change controller resources vcpu to 4+ to avoid no 4 vcpu cloud * disable -c, -r, -i in sky serve logs CLI * add REPLICA column in service status * add CONTROLLER_FAILED status; wait until control plane & redirector job to be running. * add color for CONTROLLER_FAILED and a prompt to cleanup first if re-up a failed service * change uptime to first time ready * format * add comment for replica/service status in sky serve status -h * simplify yaml design * remove controller resources cloud=gcp * remove controller resources cloud=gcpsome comment * redirect setup logs to devnull * redirector listen on 0.0.0.0 & add app_port to controller resources * ux * fix readiness suffix * fix * fix * remove cloud=gcp * ux: remove reduncant str * disable launch & down & stop with reserved prefix controller- * support sky serve down service-* * ux * cleanup cloud storage when terminate * enable customized controller resources * abort if ports specified in resources * reorder service status column * new sky serve status: show replica all the time; refresh in parallel; check network first * remove name since we have service name column * at least one replica is ready -> service ready * Update sky/cli.py Co-authored-by: Wei-Lin Chiang * Update sky/backends/backend_utils.py Co-authored-by: Wei-Lin Chiang * Update sky/status_lib.py Co-authored-by: Wei-Lin Chiang * Update sky/backends/backend_utils.py Co-authored-by: Wei-Lin Chiang * Update sky/backends/backend_utils.py Co-authored-by: Wei-Lin Chiang * Update sky/serve/redirector.py Co-authored-by: Wei-Lin Chiang * Update sky/serve/redirector.py Co-authored-by: Wei-Lin Chiang * add vllm example * upd http example * change uptime 
to None and merge get_uptime and get_replica_info * restore debug comment out code * add comment for DEFAULT_INITIAL_DELAY_SECONDS * min_replica -> min_replcias * format * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * upd tgi example * upd examples * format, remove unnecessary refresh in sky serve logs, raise valueerror instead of click.secho red * add minimal http example * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * fix typo * Apply suggestions from code review --------- Co-authored-by: Wei-Lin Chiang --- sky/backends/backend_utils.py | 203 +++-- sky/backends/cloud_vm_ray_backend.py | 18 + sky/cli.py | 307 ++++++-- sky/core.py | 67 +- sky/execution.py | 302 ++++--- sky/global_user_state.py | 76 +- sky/serve/__init__.py | 5 +- sky/serve/autoscalers.py | 20 +- sky/serve/constants.py | 7 + sky/serve/control_plane.py | 80 +- .../examples/http_minimal/http_minimal.yaml | 10 + sky/serve/examples/http_minimal/index.html | 11 + sky/serve/examples/http_server/task.yaml | 10 +- sky/serve/examples/llama2/llama2.yaml | 8 +- .../examples/stable_diffusion_service.yaml | 8 +- sky/serve/examples/tgi_coder.yaml | 10 +- .../examples/{api_server.yaml => vllm.yaml} | 34 +- sky/serve/infra_providers.py | 744 +++++++++++------- sky/serve/load_balancers.py | 8 +- sky/serve/redirector.py | 6 +- sky/serve/serve_utils.py | 254 ++++++ sky/serve/service_spec.py | 162 +++- sky/setup_files/setup.py | 4 +- sky/status_lib.py | 49 +- sky/templates/skyserve-controller.yaml.j2 | 31 +- sky/utils/cli_utils/status_utils.py | 76 +- sky/utils/schemas.py | 53 +- 27 files changed, 1803 insertions(+), 760 deletions(-) create mode 100644 sky/serve/examples/http_minimal/http_minimal.yaml create mode 100644 sky/serve/examples/http_minimal/index.html rename sky/serve/examples/{api_server.yaml => vllm.yaml} (52%) create mode 100644 sky/serve/serve_utils.py diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index abcbe66ff71..40a6b6597cf 
100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,5 @@ """Util constants/functions for the backends.""" -import base64 +import collections from datetime import datetime import difflib import enum @@ -7,7 +7,6 @@ import json import os import pathlib -import pickle import re import subprocess import tempfile @@ -105,10 +104,20 @@ spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' } +# Mapping from reserved cluster prefixes to the corresponding group name +# (logging purpose). +SKY_RESERVED_CLUSTER_PREFIXES: Dict[str, str] = { + serve_lib.CONTROLLER_PREFIX: 'SkyServe controller', +} + # Filelocks for the cluster status change. CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20 +# Filelocks for the service status change. +SERVICE_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.service.{}.lock') +SERVICE_STATUS_LOCK_TIMEOUT_SECONDS = 20 + # Remote dir that holds our runtime files. _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' @@ -2401,60 +2410,139 @@ def _refresh_cluster(cluster_name): return kept_records +def _service_status_from_replica_info( + replica_info: List[Dict[str, Any]]) -> status_lib.ServiceStatus: + status2num = collections.Counter([i['status'] for i in replica_info]) + # If one replica is READY, the service is READY. 
+ if status2num[status_lib.ReplicaStatus.READY] > 0: + return status_lib.ServiceStatus.READY + if (status2num[status_lib.ReplicaStatus.FAILED] + + status2num[status_lib.ReplicaStatus.FAILED_CLEANUP] > 0): + return status_lib.ServiceStatus.FAILED + return status_lib.ServiceStatus.REPLICA_INIT + + +def _check_controller_status_and_set_service_status( + service_name: str, cluster_name: str) -> Optional[str]: + cluster_record = global_user_state.get_cluster_from_name(cluster_name) + if (cluster_record is None or + cluster_record['status'] != status_lib.ClusterStatus.UP): + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + return f'Controller cluster {cluster_name!r} is not found or UP.' + return None + + +def _refresh_service_record_no_lock( + service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + record = global_user_state.get_service_from_name(service_name) + if record is None: + return None, None + + try: + check_network_connection() + except exceptions.NetworkError: + return record, 'Failed to refresh replica info due to network error.' + + if not record['endpoint']: + # Service controller is still initializing. Skipped refresh status. + return record, None + + controller_cluster_name = record['controller_cluster_name'] + handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name) + assert handle is not None + backend = get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + + code = serve_lib.ServeCodeGen.get_latest_info() + returncode, latest_info_payload, stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + if returncode != 0: + # If we cannot get the latest info, there are two possibilities: + # 1. The controller is not in a healthy state; + # 2. The control plane process somehow not respond to the request. 
+ # For the first case, we want to catch the error and set the service + # status to CONTROLLER_FAILED. + # TODO(tian): Since we disabled sky down the controller, we might could + # assert cluster status is UP here and remove this function. + msg = _check_controller_status_and_set_service_status( + record['name'], controller_cluster_name) + if msg is None: + msg = ('Failed to refresh replica info from the controller. ' + f'Using the cached record. Reason: {stderr}') + return record, msg + + latest_info = serve_lib.load_latest_info(latest_info_payload) + record['replica_info'] = latest_info['replica_info'] + record['uptime'] = latest_info['uptime'] + + msg = None + # When the service is shutting down, there is a period of time which the + # control plane still responds to the request, and the replica is not + # terminated, so the return value for _service_status_from_replica_info + # will still be READY, but we don't want change service status to READY. + if record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: + new_status = _service_status_from_replica_info( + latest_info['replica_info']) + record['status'] = new_status + + global_user_state.add_or_update_service(**record) + + return record, msg + + +def _refresh_service_record( + service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + try: + # TODO(tian): remove pylint disabling when filelock + # version updated + # pylint: disable=abstract-class-instantiated + with filelock.FileLock(SERVICE_STATUS_LOCK_PATH.format(service_name), + SERVICE_STATUS_LOCK_TIMEOUT_SECONDS): + return _refresh_service_record_no_lock(service_name) + except filelock.Timeout: + msg = ('Failed get the lock for service ' + f'{service_name!r}. 
Using the cached record.') + return global_user_state.get_service_from_name(service_name), msg + + def refresh_service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: if service_name is None: - service_records = global_user_state.get_services() + service_names = [ + record['name'] for record in global_user_state.get_services() + ] else: - service_record = global_user_state.get_service_from_name(service_name) - if service_record is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name} does not exist.') - service_records = [service_record] - # TODO(tian): Make it run in parallel. - for record in service_records: - endpoint = record['endpoint'] - if not endpoint: - continue - # TODO(tian): Refactor: store ip and app_port separately. - controller_ip = endpoint.split(':')[0] - controller_url = f'http://{controller_ip}:{serve_lib.CONTROL_PLANE_PORT}' - try: - resp = requests.get(controller_url + - '/control_plane/get_replica_nums', - timeout=5) - except requests.RequestException: - pass - else: - record.update(resp.json()) - if record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: - # TODO(tian): Current behaviour for user bugs in setup section - # is to teardown and relaunching forever. We should have a way - # to detect such bugs and stop relaunching. 
- if record['num_failed_replicas'] > 0: - record['status'] = status_lib.ServiceStatus.FAILED - elif record['num_ready_replicas'] > 0: - record['status'] = status_lib.ServiceStatus.READY - elif record['num_unhealthy_replicas'] > 0: - record['status'] = status_lib.ServiceStatus.REPLICA_INIT - global_user_state.add_or_update_service(**record) - if service_name is not None: - assert record['name'] == service_name - try: - resp = requests.get(controller_url + - '/control_plane/get_replica_info', - timeout=5) - except requests.RequestException: - pass - else: - record['replica_info'] = resp.json()['replica_info'] - decoded_info = [] - for info in record['replica_info']: - decoded_info.append({ - k: pickle.loads(base64.b64decode(v)) - for k, v in info.items() - }) - record['replica_info'] = decoded_info - return service_records + service_names = [service_name] + + plural = 's' if len(service_names) > 1 else '' + progress = rich_progress.Progress(transient=True, + redirect_stdout=False, + redirect_stderr=False) + task = progress.add_task( + (f'[bold cyan]Refreshing status for {len(service_names)} ' + f'service{plural}[/]'), + total=len(service_names)) + + def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: + record, msg = _refresh_service_record(service_name) + if msg is not None: + progress.stop() + print( + f'{colorama.Fore.YELLOW}Error occurred when refreshing service ' + f'{service_name}: {msg}{colorama.Style.RESET_ALL}') + progress.update(task, advance=1) + return record + + with progress: + updated_records = subprocess_utils.run_in_parallel( + _refresh_service, service_names) + + return [record for record in updated_records if record is not None] @typing.overload @@ -2523,7 +2611,7 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: def check_cluster_name_not_reserved( cluster_name: Optional[str], operation_str: Optional[str] = None) -> None: - """Errors out if the cluster is a reserved cluster (spot controller). 
+ """Errors out if the cluster name is reserved (e.g., spot/serve controller). Raises: sky.exceptions.NotSupportedError: if the cluster name is reserved, raise @@ -2532,9 +2620,16 @@ def check_cluster_name_not_reserved( Returns: None, if the cluster name is not reserved. """ + msg = None if cluster_name in SKY_RESERVED_CLUSTER_NAMES: msg = (f'Cluster {cluster_name!r} is reserved for the ' f'{SKY_RESERVED_CLUSTER_NAMES[cluster_name].lower()}.') + for prefix in SKY_RESERVED_CLUSTER_PREFIXES: + if cluster_name is not None and cluster_name.startswith(prefix): + msg = (f'Cluster prefix {prefix!r} is reserved for the ' + f'{SKY_RESERVED_CLUSTER_PREFIXES[prefix].lower()}.') + break + if msg is not None: if operation_str is not None: msg += f' {operation_str} is not allowed.' with ux_utils.print_exception_no_traceback(): diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3adb6c1309b..a3d52343f8a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3308,6 +3308,24 @@ def tail_spot_logs(self, stdin=subprocess.DEVNULL, ) + def tail_serve_logs(self, handle: CloudVmRayResourceHandle, + service_name: str, replica_id: int, + follow: bool) -> None: + code = serve_lib.ServeCodeGen.stream_logs(service_name, replica_id, + follow) + + signal.signal(signal.SIGINT, backend_utils.interrupt_handler) + signal.signal(signal.SIGTSTP, backend_utils.stop_handler) + + self.run_on_head( + handle, + code, + stream_logs=True, + process_stream=False, + ssh_mode=command_runner.SshMode.INTERACTIVE, + stdin=subprocess.DEVNULL, + ) + def teardown_no_lock(self, handle: CloudVmRayResourceHandle, terminate: bool, diff --git a/sky/cli.py b/sky/cli.py index 37dd68f9c3b..6ea02be99f6 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -52,6 +52,7 @@ from sky import global_user_state from sky import sky_logging from sky import spot as spot_lib +from sky import serve as serve_lib from sky import status_lib from sky.backends import 
backend_utils from sky.backends import onprem_utils @@ -135,6 +136,18 @@ def _get_glob_storages(storages: List[str]) -> List[str]: return list(set(glob_storages)) +def _get_glob_services(service_names: List[str]) -> List[str]: + """Returns a list of service names that match the glob pattern.""" + glob_service_names = [] + for service_name in service_names: + glob_service_name = global_user_state.get_glob_service_names( + service_name) + if not glob_service_name: + click.echo(f'Service {service_name!r} not found.') + glob_service_names.extend(glob_service_name) + return list(set(glob_service_names)) + + def _warn_if_local_cluster(cluster: str, local_clusters: List[str], message: str) -> bool: """Raises warning if the cluster name is a local cluster.""" @@ -2620,6 +2633,28 @@ def _down_or_stop_clusters( f'Skipping local cluster {c}, as it does not support ' '`sky stop/autostop`.')) ] + name_to_reserved_prefix = dict() + for name in names: + for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: + if name.startswith(prefix): + name_to_reserved_prefix[name] = prefix + break + names = [name for name in names if name not in name_to_reserved_prefix] + reserve_prefix_str = ', '.join( + [f'{prefix}*' for prefix in name_to_reserved_prefix.values()]) + if len(name_to_reserved_prefix) > 0: + if len(names) != 0: + names_str = ', '.join(map(repr, names)) + raise click.UsageError( + f'{operation} cluster(s) with reserved prefix ' + f'{reserve_prefix_str} with other cluster(s) ' + f'{names_str} is currently not supported.\n' + 'Please omit the cluster(s) with reserved prefix ' + f'{name_to_reserved_prefix.values()}.') + raise click.UsageError( + f'{operation} cluster(s) with reserved prefix ' + f'{reserve_prefix_str} is not supported. To teardown a ' + 'service, please use `sky serve down`.') # Make sure the reserved clusters are explicitly specified without other # normal clusters. 
if len(reserved_clusters) > 0: @@ -3817,7 +3852,7 @@ def serve(): type=str, **_get_shell_complete_args(_complete_file_name)) @click.option('--service-name', - '-s', + '-n', default=None, type=str, help='A service name. Unique for each service. If not provided, ' @@ -3833,9 +3868,9 @@ def serve_up( service_name: Optional[str], yes: bool, ): - """Launches a SkyServe instance. + """Launch a SkyServe service. - ENTRYPOINT must points to a valid YAML file. + ENTRYPOINT must point to a valid YAML file. Example: @@ -3846,8 +3881,18 @@ def serve_up( if service_name is None: service_name = backend_utils.generate_service_name() - if global_user_state.get_service_from_name(service_name) is not None: - click.secho(f'Service {service_name!r} already exists.', fg='red') + previous_service_record = global_user_state.get_service_from_name( + service_name) + if previous_service_record is not None: + if previous_service_record['status'] in [ + status_lib.ServiceStatus.CONTRLLER_FAILED, + status_lib.ServiceStatus.FAILED + ]: + prompt = (f'Service {service_name!r} has failed. ' + 'Please clean up the service and try again.') + else: + prompt = f'Service {service_name!r} already exists.' + click.secho(prompt, fg='red') return shell_splits = shlex.split(entrypoint) @@ -3906,17 +3951,60 @@ def serve_up( if len(dag.tasks) > 1: click.secho('Multiple tasks found in the YAML file.', fg='red') return - task = dag.tasks[0] + task: sky.Task = dag.tasks[0] if task.service is None: click.secho('Service section not found in the YAML file.', fg='red') return + assert len(task.resources) == 1 + if list(task.resources)[0].ports is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Specifying ports in resources is not allowed. 
SkyServe will ' + 'use the port specified in the service section.') + return + + if task.service.controller_resources is not None: + controller_resources_config = task.service.controller_resources + else: + controller_resources_config = serve_lib.CONTROLLER_RESOURCES + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + raise ValueError( + 'Encountered error when parsing controller resources') from e + if controller_resources.ports is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify ports in controller resources. SkyServe ' + 'will use the port specified in the service section.') + return + + click.secho('Service Spec:', fg='cyan') + click.echo(task.service) + + dummy_controller_task = sky.Task().set_resources(controller_resources) + click.secho('The controller will use the following resources:', fg='cyan') + with sky.Dag() as dag: + dag.add(dummy_controller_task) + sky.optimize(dag) + click.echo() + + dummy_controller_task: sky.Task = dag.tasks[0] + controller_best_resources = dummy_controller_task.best_resources + + click.secho('Each replica will use the following resource:', fg='cyan') + with sky.Dag() as dag: + dag.add(task) + sky.optimize(dag) + click.echo() if not yes: prompt = f'Launching a new service {service_name}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_up(task, service_name) + sky.serve_up(task, service_name, controller_best_resources) @serve.command('status', cls=_DocumentedCodeCommand) @@ -3933,7 +4021,66 @@ def serve_up( @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, service_name: Optional[str]): - """Show statuses of SkyServe services. + """Show statuses of SkyServe service. + + Show detailed statuses of the service. If SERVICE_NAME is not provided, + show all services' status. 
+ + Each service can have one of the following statuses: + + - ``CONTROLLER_INIT``: The controller is initializing. + + - ``REPLICA_INIT``: The controller provisioning has succeeded; control + plane and redirector are alive, and there are no available replicas for + now. This also indicates that no replica failure has been detected. + + - ``CONTROLLER_FAILED``: The controller failed to start or is in an abnormal + state; or the control plane and redirector are not alive. + + - ``READY``: The controller is ready to serve requests. This means that + at least one replica has passed the readiness probe. + + - ``SHUTTING_DOWN``: The controller is being shut down. This usually + happens when the `sky serve down` command is called. + + - ``FAILED``: At least one replica failed and no replica is ready. This + could be caused by several reasons: + + - The launching process of the replica failed. + + - No readiness probe passed within initial delay seconds. + + - The replica continuously failed after serving requests for a while. + + - User code failed. + + Each replica can have one of the following statuses: + + - ``PROVISIONING``: The replica is being provisioned. + + - ``STARTING``: The replica provisioning has succeeded and the replica is + initializing its service, e.g., installing dependencies or loading model + weights. + + - ``READY``: The replica is ready to serve requests. + + - ``NOT_READY``: Currently, this replica failed the readiness probe but not + continuously failed for some time. This usually happens when the replica + is suffering from a bad network connection or there are too many requests + overwhelming the replica. + + - ``SHUTTING_DOWN``: The replica is being shut down. This usually + happens when the replica is being scaled down or some error occurred. + SkyServe will terminate all replicas that encountered an error. + + - ``FAILED``: Some error occurred when the replica is serving requests. 
+ This indicates that the replica is already shut down. (Otherwise, it is + ``SHUTTING_DOWN``.) + + - ``FAILED_CLEANUP``: Some error occurred when the replica is shutting down. + This usually indicates some resources leakage happened since the + termination not finished correctly. When seeing this status, please login + to cloud console and check whether there are some resources not released. Examples: @@ -3945,35 +4092,36 @@ def serve_status(all: bool, service_name: Optional[str]): # Show detailed status for all services sky serve status -a \b - # Show service status and replica status for a specific service + # Only show status of my-service sky serve status my-service - \b - # Show detailed service status and replica status for a specific service - sky serve status my-service -a """ service_records = core.service_status(service_name) + if service_name is not None and not service_records: + click.secho(f'Service {service_name!r} not found.', fg='red') + return click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') status_utils.show_service_table(service_records, all) - if service_name is not None: - # If service not exist, we should already raise an error in - # core.service_status. 
- assert len(service_records) == 1, service_records - service_record = service_records[0] - if 'replica_info' not in service_record: - click.secho(f'Failed to refresh status of service: {service_name}.', - fg='red') - return - click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Replicas of {service_name}{colorama.Style.RESET_ALL}') - status_utils.show_replica_table(service_record['replica_info'], all) + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Replicas{colorama.Style.RESET_ALL}') + replica_infos = [] + for service_record in service_records: + for replica_record in service_record['replica_info']: + replica_record['service_name'] = service_record['name'] + replica_infos.append(replica_record) + status_utils.show_replica_table(replica_infos, all) @serve.command('down', cls=_DocumentedCodeCommand) -@click.argument('service_name', - required=True, - type=str, +@click.argument('service_names', + nargs=-1, + required=False, **_get_shell_complete_args(_complete_service_name)) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Stop all existing clusters.') @click.option('--yes', '-y', is_flag=True, @@ -3987,23 +4135,95 @@ def serve_status(all: bool, service_name: Optional[str]): required=False, help='Ignore errors (if any). ') def serve_down( - service_name: str, + service_names: List[str], + all: Optional[bool], # pylint: disable=redefined-builtin yes: bool, purge: bool, ): - """Stops a SkyServe instance. + """Tear down service(s). + + SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If + both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence. + + Tear down a service will delete all associated resources, including the + controller VM and all replicas. Example: .. code-block:: bash + # Tear down a specific service. sky serve down my-service + \b + # Tear down multiple services. 
+ sky serve down my-service1 my-service2 + \b + # Tear down all services matching glob pattern 'service-*'. + sky serve down "service-*" + \b + # Tear down all existing services. + sky serve down -a + """ + if not service_names and not all: + raise click.UsageError( + '`sky serve down` requires either a service name (see ' + '`sky serve status`) or --all to be specified.') + if all: + service_names = [ + record['name'] for record in global_user_state.get_services() + ] + else: + service_names = _get_glob_services(service_names) + + if not service_names: + click.echo('\nService(s) not found (tip: see `sky serve status`).') + return + + plural = '' if len(service_names) == 1 else 's' if not yes: - prompt = f'Tearing down service {service_name}. Proceed?' - click.confirm(prompt, default=True, abort=True, show_default=True) + service_name_list = ', '.join(service_names) + click.confirm( + f'Tearing down {len(service_names)} service{plural}: ' + f'{service_name_list}. Proceed?', + default=True, + abort=True, + show_default=True) - sky.serve_down(service_name, purge) + progress = rich_progress.Progress(transient=True, + redirect_stdout=False, + redirect_stderr=False) + task = progress.add_task( + f'[bold cyan]Tearing down {len(service_names)} service{plural}[/]', + total=len(service_names)) + + def _down_service(name: str): + success_progress = False + try: + sky.serve_down(name, purge) + except RuntimeError as e: + message = (f'{colorama.Fore.RED}Teardown service {name}...failed. ' + f'{colorama.Style.RESET_ALL}' + f'\nReason: {common_utils.format_exception(e)}.') + except (exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError) as e: + message = str(e) + else: + message = (f'{colorama.Fore.GREEN}Teardown service {name}...done.' 
+ f'{colorama.Style.RESET_ALL}') + success_progress = True + + progress.stop() + click.echo(message) + if success_progress: + progress.update(task, advance=1) + progress.start() + + with progress: + subprocess_utils.run_in_parallel(_down_service, service_names) + progress.live.transient = False + # Make sure the progress bar not mess up the terminal. + progress.refresh() @serve.command('logs', cls=_DocumentedCodeCommand) @@ -4014,33 +4234,27 @@ def serve_down( help=('Follow the logs of the job. [default: --follow] ' 'If --no-follow is specified, print the log so far and exit.')) @click.option('--control-plane', - '-c', is_flag=True, default=False, required=False, help='Show the control plane logs of this service.') @click.option('--redirector', - '-r', is_flag=True, default=False, required=False, help='Show the redirector logs of this service.') -@click.option('--replica-id', - '-i', - default=None, - required=False, - help='Show the logs of a specific replica.') @click.argument('service_name', required=True, type=str, **_get_shell_complete_args(_complete_service_name)) +@click.argument('replica_id', required=False, type=int) @usage_lib.entrypoint def serve_logs( service_name: str, follow: bool, control_plane: bool, redirector: bool, - replica_id: Optional[str], + replica_id: Optional[int], ): """Tail the log of a service. @@ -4049,10 +4263,13 @@ def serve_logs( .. 
code-block:: bash # Tail the control plane logs of a service - sky serve logs -c [SERVICE_ID] + sky serve logs --control-plane [SERVICE_ID] \b # Print the redirector logs so far and exit - sky serve logs -r --no-follow [SERVICE_ID] + sky serve logs --redirector --no-follow [SERVICE_ID] + \b + # Tail the logs of replica 1 + sky serve logs [SERVICE_ID] 1 """ have_replica_id = replica_id is not None if (control_plane + redirector + have_replica_id) != 1: @@ -4069,10 +4286,10 @@ def serve_logs( controller_name = service_record['controller_cluster_name'] if control_plane: core.tail_logs(controller_name, job_id=1, follow=follow) - if redirector: + elif redirector: core.tail_logs(controller_name, job_id=2, follow=follow) - if have_replica_id: - raise NotImplementedError + else: + core.serve_tail_logs(service_record, replica_id, follow=follow) # ============================== diff --git a/sky/core.py b/sky/core.py index b47cfb79df6..793e6a711da 100644 --- a/sky/core.py +++ b/sky/core.py @@ -114,6 +114,30 @@ def service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: return backend_utils.refresh_service_status(service_name) +@usage_lib.entrypoint +def serve_tail_logs(service_record: Dict[str, Any], replica_id: int, + follow: bool) -> None: + service_name = service_record['name'] + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Service {service_name!r} is still initializing its ' + 'controller. Please try again later.') + if service_record['status'] == status_lib.ServiceStatus.CONTRLLER_FAILED: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r}\'s controller failed. 
' + 'Cannot tail logs.') + controller_cluster_name = service_record['controller_cluster_name'] + handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name) + if handle is None: + raise ValueError(f'Cannot find controller for service {service_name}.') + assert isinstance(handle, backends.CloudVmRayResourceHandle), handle + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend), backend + backend.tail_serve_logs(handle, service_name, replica_id, follow=follow) + + @usage_lib.entrypoint def cost_report() -> List[Dict[str, Any]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. @@ -536,6 +560,7 @@ def cancel( job_ids: Optional[List[int]] = None, # pylint: disable=invalid-name _try_cancel_if_cluster_is_init: bool = False, + _from_serve_core: bool = False, ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Cancel jobs on a cluster. @@ -562,8 +587,10 @@ def cancel( 'sky cancel requires either a job id ' f'(see `sky queue {cluster_name} -s`) or the --all flag.') - backend_utils.check_cluster_name_not_reserved( - cluster_name, operation_str='Cancelling jobs') + if not _from_serve_core: + # Skip name checking when the call is from serve core. + backend_utils.check_cluster_name_not_reserved( + cluster_name, operation_str='Cancelling jobs') # Check the status of the cluster. handle = None @@ -591,17 +618,20 @@ def cancel( backend = backend_utils.get_backend_from_handle(handle) if all: - sky_logging.print(f'{colorama.Fore.YELLOW}' - f'Cancelling all jobs on cluster {cluster_name!r}...' - f'{colorama.Style.RESET_ALL}') + if not _from_serve_core: + sky_logging.print( + f'{colorama.Fore.YELLOW}' + f'Cancelling all jobs on cluster {cluster_name!r}...' 
+ f'{colorama.Style.RESET_ALL}') job_ids = None else: assert job_ids is not None, 'job_ids should not be None' - jobs_str = ', '.join(map(str, job_ids)) - sky_logging.print( - f'{colorama.Fore.YELLOW}' - f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...' - f'{colorama.Style.RESET_ALL}') + if not _from_serve_core: + jobs_str = ', '.join(map(str, job_ids)) + sky_logging.print( + f'{colorama.Fore.YELLOW}' + f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...' + f'{colorama.Style.RESET_ALL}') backend.cancel_jobs(handle, job_ids) @@ -688,10 +718,12 @@ def download_logs( @usage_lib.entrypoint -def job_status(cluster_name: str, - job_ids: Optional[List[int]], - stream_logs: bool = False - ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: +def job_status( + cluster_name: str, + job_ids: Optional[List[int]], + silent: bool = False, + stream_logs: bool = False +) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Get the status of jobs. @@ -728,9 +760,10 @@ def job_status(cluster_name: str, if job_ids is not None and len(job_ids) == 0: return {} - sky_logging.print(f'{colorama.Fore.YELLOW}' - 'Getting job status...' - f'{colorama.Style.RESET_ALL}') + if not silent: + sky_logging.print(f'{colorama.Fore.YELLOW}' + 'Getting job status...' 
+ f'{colorama.Style.RESET_ALL}') usage_lib.record_cluster_name_for_current_operation(cluster_name) statuses = backend.get_job_status(handle, job_ids, stream_logs=stream_logs) diff --git a/sky/execution.py b/sky/execution.py index 2ba9bfe3139..854f4cc182f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -16,7 +16,9 @@ import enum import getpass import requests +from rich import console as rich_console import tempfile +import time import os import uuid from typing import Any, Dict, List, Optional, Union @@ -42,6 +44,7 @@ from sky.data import storage as storage_lib from sky.usage import usage_lib from sky.skylet import constants +from sky.skylet import job_lib from sky.utils import common_utils from sky.utils import dag_utils from sky.utils import log_utils @@ -383,8 +386,12 @@ def _execute( env = dict(os.environ, **{env_options.Options.DISABLE_LOGGING.value: '1'}) subprocess_utils.run('sky status --no-show-spot-jobs', env=env) - print() - print('\x1b[?25h', end='') # Show cursor. + # UX: Don't show cursor if we are initializing a skyserve controller, + # since it will mess up the progress bar. + if (cluster_name is None or + not cluster_name.startswith(serve.CONTROLLER_PREFIX)): + print() + print('\x1b[?25h', end='') # Show cursor. @timeline.event @@ -953,6 +960,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): def serve_up( task: 'sky.Task', service_name: str, + controller_best_resources: 'sky.Resources', ): """Serve up a service. 
@@ -970,9 +978,9 @@ def serve_up( assert len(task.resources) == 1 requested_resources = list(task.resources)[0] global_user_state.add_or_update_service( - service_name, controller_cluster_name, '', - status_lib.ServiceStatus.CONTROLLER_INIT, 0, 0, 0, policy, - requested_resources) + service_name, None, controller_cluster_name, '', + status_lib.ServiceStatus.CONTROLLER_INIT, policy, requested_resources, + []) app_port = int(task.service.app_port) assert len(task.resources) == 1, task original_resources = list(task.resources)[0] @@ -984,7 +992,8 @@ def serve_up( task.set_resources(original_resources.copy(ports=[app_port])) # TODO(tian): Use skyserve constants. - # TODO(tian): Clean up storage when the service is torn down. + # The storage will be cleaned up by the control plane `terminate` method + # after the service is terminated. _maybe_translate_local_file_mounts_and_sync_up(task) with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', @@ -997,12 +1006,8 @@ def serve_up( remote_task_yaml_path = (serve.SERVICE_YAML_PREFIX + f'/service_{service_name}.yaml') vars_to_fill = { - 'ports': [app_port, serve.CONTROL_PLANE_PORT], 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': f.name, - 'is_dev': env_options.Options.IS_DEVELOPER.get(), - 'is_debug': env_options.Options.SHOW_DEBUG_INFO.get(), - 'disable_logging': env_options.Options.DISABLE_LOGGING.get(), } controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, f'{service_name}.yaml') @@ -1010,7 +1015,18 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) - assert len(controller_task.resources) == 1, controller_task + controller_task.best_resources = (controller_best_resources.copy( + ports=[app_port])) + + controller_envs = { + 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, + 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), + 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), + 
'SKYPILOT_DISABLE_USAGE_COLLECTION': + env_options.Options.DISABLE_LOGGING.get(), + } + controller_task.update_envs(controller_envs) + print(f'{colorama.Fore.YELLOW}' f'Launching controller for {service_name}...' f'{colorama.Style.RESET_ALL}') @@ -1022,86 +1038,124 @@ def serve_up( retry_until_up=True, ) - handle = global_user_state.get_handle_from_cluster_name( + cluster_record = global_user_state.get_cluster_from_name( controller_cluster_name) + if (cluster_record is None or + cluster_record['status'] != status_lib.ClusterStatus.UP): + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + print(f'{colorama.Fore.RED}Controller failed to launch. ' + f'Please check the logs above.{colorama.Style.RESET_ALL}') + return + + handle = cluster_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) endpoint = f'{handle.head_ip}:{task.service.app_port}' - controller_envs = { - 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, - 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), - 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), - 'SKYPILOT_DISABLE_USAGE_COLLECTION': - env_options.Options.DISABLE_LOGGING.get(), - } + console = rich_console.Console() + + def _wait_until_job_is_running(cluster_name: str, + job_id: int, + retry_time: int = 30) -> bool: + for _ in range(retry_time): + job_statuses = core.job_status(cluster_name, [job_id], + silent=True) + job_status = job_statuses.get(str(job_id), None) + if job_status == job_lib.JobStatus.RUNNING: + return True + time.sleep(1) + return False # NOTICE: The job submission order cannot be changed since the # `sky serve logs` CLI will identify the control plane job with # the first job submitted and the redirector job with the second # job submitted. - print( - f'{colorama.Fore.YELLOW}' - 'Launching control plane process on controller...' 
- f'{colorama.Style.RESET_ALL}', - end='') - _execute( - entrypoint=sky.Task( - name='run-control-plane', - envs=controller_envs, - run='python -m sky.serve.control_plane --service-name ' - f'{service_name} --task-yaml {remote_task_yaml_path} ' - f'--port {serve.CONTROL_PLANE_PORT}'), - stream_logs=False, - handle=handle, - stages=[Stage.EXEC], - cluster_name=controller_cluster_name, - detach_run=True, - ) + with console.status('[yellow]Launching control plane process on ' + 'controller...[/yellow]'): + _execute( + entrypoint=sky.Task( + name='run-control-plane', + envs=controller_envs, + run='python -m sky.serve.control_plane --service-name ' + f'{service_name} --task-yaml {remote_task_yaml_path} ' + f'--port {serve.CONTROL_PLANE_PORT}'), + stream_logs=False, + handle=handle, + stages=[Stage.EXEC], + cluster_name=controller_cluster_name, + detach_run=True, + ) + control_plane_job_is_running = _wait_until_job_is_running( + controller_cluster_name, 1) + if not control_plane_job_is_running: + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + print(f'{colorama.Fore.RED}Control plane failed to launch. ' + f'Please check the logs with sky serve logs {service_name} ' + f'--control-plane{colorama.Style.RESET_ALL}') + return + print(f'{colorama.Fore.GREEN}Control plane process is running.' + f'{colorama.Style.RESET_ALL}') - print( - f'{colorama.Fore.YELLOW}' - 'Launching redirector process on controller...' 
- f'{colorama.Style.RESET_ALL}', - end='') - control_plane_addr = f'http://0.0.0.0:{serve.CONTROL_PLANE_PORT}' - _execute( - entrypoint=sky.Task( - name='run-redirector', - envs=controller_envs, - run='python -m sky.serve.redirector --task-yaml ' - f'{remote_task_yaml_path} --port {app_port} ' - f'--control-plane-addr {control_plane_addr}'), - stream_logs=False, - handle=handle, - stages=[Stage.EXEC], - cluster_name=controller_cluster_name, - detach_run=True, - ) + with console.status('[yellow]Launching redirector process on ' + 'controller...[/yellow]'): + control_plane_addr = f'http://localhost:{serve.CONTROL_PLANE_PORT}' + _execute( + entrypoint=sky.Task( + name='run-redirector', + envs=controller_envs, + run='python -m sky.serve.redirector --task-yaml ' + f'{remote_task_yaml_path} --port {app_port} ' + f'--control-plane-addr {control_plane_addr}'), + stream_logs=False, + handle=handle, + stages=[Stage.EXEC], + cluster_name=controller_cluster_name, + detach_run=True, + ) + redirector_job_is_running = _wait_until_job_is_running( + controller_cluster_name, 2) + if not redirector_job_is_running: + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + print(f'{colorama.Fore.RED}Redirector failed to launch. ' + f'Please check the logs with sky serve logs {service_name} ' + f'--redirector{colorama.Style.RESET_ALL}') + return + print(f'{colorama.Fore.GREEN}Redirector process is running.' + f'{colorama.Style.RESET_ALL}') - global_user_state.add_or_update_service( - service_name, controller_cluster_name, endpoint, - status_lib.ServiceStatus.REPLICA_INIT, 0, 0, 0, policy, - requested_resources) + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.REPLICA_INIT) + global_user_state.set_service_endpoint(service_name, endpoint) - print(f'{colorama.Style.BRIGHT}{colorama.Fore.CYAN}' - 'Gateway endpoint serving at ' - f'{colorama.Style.RESET_ALL}{colorama.Fore.CYAN}' - f'{endpoint}.' 
- f'{colorama.Style.RESET_ALL}') print(f'\n{colorama.Fore.CYAN}Service name: ' f'{colorama.Style.BRIGHT}{service_name}{colorama.Style.RESET_ALL}' - '\nTo see detailed info about replicas:' - f'\t{backend_utils.BOLD}sky serve status {service_name} (-a)' + '\nTo see detailed info:' + f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' f'{backend_utils.RESET_BOLD}' '\nTo see logs of controller:' - f'\t\t{backend_utils.BOLD}sky serve logs -c {service_name}' - f'{backend_utils.RESET_BOLD}' + f'\t{backend_utils.BOLD}sky serve logs --control-plane ' + f'{service_name}{backend_utils.RESET_BOLD}' '\nTo see logs of redirector:' - f'\t\t{backend_utils.BOLD}sky serve logs -r {service_name}' - f'{backend_utils.RESET_BOLD}' + f'\t{backend_utils.BOLD}sky serve logs --redirector ' + f'{service_name}{backend_utils.RESET_BOLD}' + '\nTo see logs of one replica:' + f'\t{backend_utils.BOLD}sky serve logs {service_name} ' + f'[REPLICA_ID]{backend_utils.RESET_BOLD}' '\nTo teardown the service:' - f'\t\t{backend_utils.BOLD}sky serve down {service_name}' - f'{backend_utils.RESET_BOLD}') + f'\t{backend_utils.BOLD}sky serve down {service_name}' + f'{backend_utils.RESET_BOLD}' + f'\n(use {backend_utils.BOLD}sky serve status {service_name}' + f'{backend_utils.RESET_BOLD} to get all valid REPLICA_ID)') + print(f'\n{colorama.Style.BRIGHT}{colorama.Fore.CYAN}' + 'Endpoint URL: ' + f'{colorama.Style.RESET_ALL}{colorama.Fore.CYAN}' + f'{endpoint}' + f'{colorama.Style.RESET_ALL}') + print(f'{colorama.Fore.GREEN}Starting replica now...' + f'{colorama.Style.RESET_ALL}') + print('Please use the above command to find the latest status.') def serve_down( @@ -1113,74 +1167,82 @@ def serve_down( Please refer to the sky.cli.serve_down for the document. Args: - name: Name of the service. + service_name: Name of the service. - Raises: + purge: If true, ignore errors when cleaning up the controller. 
""" service_record = global_user_state.get_service_from_name(service_name) - if service_record is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name} does not exist.') + # Already filered all inexist service in cli.py + assert service_record is not None, service_name controller_cluster_name = service_record['controller_cluster_name'] - num_ready_replicas = service_record['num_ready_replicas'] - num_unhealthy_replicas = service_record['num_unhealthy_replicas'] - num_failed_replicas = service_record['num_failed_replicas'] - num_replicas = (num_ready_replicas + num_unhealthy_replicas + - num_failed_replicas) - controller_ip = service_record['endpoint'].split(':')[0] - controller_url = f'http://{controller_ip}:{serve.CONTROL_PLANE_PORT}' - handle = global_user_state.get_handle_from_cluster_name( - controller_cluster_name) global_user_state.set_service_status(service_name, status_lib.ServiceStatus.SHUTTING_DOWN) + handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name) - try: - if handle is not None: - plural = '' - if num_replicas > 1: - plural = 's' - print(f'{colorama.Fore.YELLOW}' - f'Tearing down {num_replicas} replica{plural}...' 
- f'{colorama.Style.RESET_ALL}') - resp = requests.post(controller_url + '/control_plane/terminate', - data='') + if handle is not None: + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + try: + code = serve.ServeCodeGen.terminate_service() + returncode, terminate_service_payload, stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + try: + subprocess_utils.handle_returncode( + returncode, + code, + f'Failed to terminate service {service_name}', + stderr, + stream_logs=False) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + resp = serve.load_terminate_service_result( + terminate_service_payload) if resp.status_code != 200: - raise RuntimeError('Failed to terminate replica due to ' - f'request failure: {resp.text}') + raise RuntimeError('Failed to terminate replica of service ' + f'{service_name} due to request ' + f'failure: {resp.text}') msg = resp.json()['message'] if msg: raise RuntimeError( - 'Unexpected message when tearing down ' - f'replica: {msg}. Please login to the controller ' + 'Unexpected message when tearing down replica of service ' + f'{service_name}: {msg}. Please login to the controller ' 'and make sure the service is properly cleaned.') - except (RuntimeError, ValueError, requests.exceptions.ConnectionError) as e: - if purge: - logger.warning(f'Ignoring error when cleaning controller: {e}') - else: - raise e + except (RuntimeError, ValueError, + requests.exceptions.ConnectionError) as e: + if purge: + logger.warning('Ignoring error when cleaning replicas of ' + f'{service_name}: {e}') + else: + raise RuntimeError() from e + else: + if not purge: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Cannot find controller cluster of service {service_name}.' 
+ ) try: - print( - f'{colorama.Fore.YELLOW}' - f'Stopping control plane and redirector processes on controller...' - f'{colorama.Style.RESET_ALL}') - core.cancel(controller_cluster_name, all=True) + core.cancel(controller_cluster_name, all=True, _from_serve_core=True) except (ValueError, sky.exceptions.ClusterNotUpError) as e: if purge: - logger.warning(f'Ignoring error when stopping controller: {e}') + logger.warning('Ignoring error when stopping control plane and ' + f'redirector jobs of service {service_name}: {e}') else: - raise e + raise RuntimeError() from e try: - print(f'{colorama.Fore.YELLOW}' - 'Tearing down controller...' - f'{colorama.Style.RESET_ALL}') core.down(controller_cluster_name, purge=purge) except (RuntimeError, ValueError) as e: if purge: - logger.warning(f'Ignoring error when cleaning controller: {e}') + logger.warning('Ignoring error when terminating controller VM of ' + f'service {service_name}: {e}') else: - raise e + raise RuntimeError() from e # TODO(tian): Maybe add a post_cleanup function? controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, @@ -1188,7 +1250,3 @@ def serve_down( if os.path.exists(controller_yaml_path): os.remove(controller_yaml_path) global_user_state.remove_service(service_name) - - print(f'{colorama.Fore.GREEN}' - f'The tearing down of service {service_name} is done.' 
- f'{colorama.Style.RESET_ALL}') diff --git a/sky/global_user_state.py b/sky/global_user_state.py index c1b155437ef..7d65205b970 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -97,14 +97,13 @@ def create_table(cursor, conn): cursor.execute("""\ CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, + uptime INTEGER, controller_cluster_name TEXT, endpoint TEXT, status TEXT, - num_ready_replicas INTEGER DEFAULT 0, - num_unhealthy_replicas INTEGER DEFAULT 0, - num_failed_replicas INTEGER DEFAULT 0, policy TEXT, - requested_resources BLOB)""") + requested_resources BLOB, + replica_info BLOB)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. @@ -286,54 +285,48 @@ def add_or_update_cluster(cluster_name: str, def add_or_update_service( - name: str, controller_cluster_name: str, endpoint: str, - status: status_lib.ServiceStatus, num_ready_replicas: int, - num_unhealthy_replicas: int, num_failed_replicas, policy: str, - requested_resources: Optional['resources_lib.Resources']): + name: str, uptime: Optional[int], controller_cluster_name: str, + endpoint: str, status: status_lib.ServiceStatus, policy: str, + requested_resources: Optional['resources_lib.Resources'], + replica_info: List[Dict[str, Any]]) -> None: _DB.cursor.execute( 'INSERT or REPLACE INTO services' - '(name, controller_cluster_name, endpoint, status, ' - 'num_ready_replicas, num_unhealthy_replicas, ' - 'num_failed_replicas, policy, requested_resources) ' + '(name, uptime, controller_cluster_name, endpoint, ' + 'status, policy, requested_resources, replica_info) ' 'VALUES (' # name '?, ' + # uptime + '?, ' # controller_cluster_name '?, ' # endpoint '?, ' # status '?, ' - # num_ready_replicas - '?, ' - # num_unhealthy_replicas - '?, ' - # num_failed_replicas - '?, ' # policy '?, ' # requested_resources + '?, ' + # replica_info '?' 
')', ( # name name, + # uptime + uptime, # controller_cluster_name controller_cluster_name, # endpoint endpoint, # status status.value, - # num_ready_replicas - num_ready_replicas, - # num_unhealthy_replicas - num_unhealthy_replicas, - # num_failed_replicas - num_failed_replicas, # policy policy, # requested_resources pickle.dumps(requested_resources), + pickle.dumps(replica_info), )) _DB.conn.commit() @@ -395,6 +388,16 @@ def set_service_status(service_name: str, status: status_lib.ServiceStatus): raise ValueError(f'Service {service_name} not found.') +def set_service_endpoint(service_name: str, endpoint: str): + _DB.cursor.execute('UPDATE services SET endpoint=(?) ' + 'WHERE name=(?)', (endpoint, service_name)) + count = _DB.cursor.rowcount + _DB.conn.commit() + assert count <= 1, count + if count == 0: + raise ValueError(f'Service {service_name} not found.') + + def get_handle_from_cluster_name( cluster_name: str) -> Optional['backends.ResourceHandle']: assert cluster_name is not None, 'cluster_name cannot be None' @@ -412,6 +415,13 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]: return [row[0] for row in rows] +def get_glob_service_names(service_name: str) -> List[str]: + assert service_name is not None, 'service_name cannot be None' + rows = _DB.cursor.execute('SELECT name FROM services WHERE name GLOB (?)', + (service_name,)) + return [row[0] for row in rows] + + def set_cluster_status(cluster_name: str, status: status_lib.ClusterStatus) -> None: _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', ( @@ -624,20 +634,18 @@ def get_service_from_name( # Explicitly specify the number of fields to unpack, so that # we can add new fields to the database in the future without # breaking the previous code. 
- (name, controller_cluster_name, endpoint, status, num_ready_replicas, - num_unhealthy_replicas, num_failed_replicas, policy, - requested_resources) = row[:9] + (name, uptime, controller_cluster_name, endpoint, status, policy, + requested_resources, replica_info) = row[:8] # TODO: use namedtuple instead of dict record = { 'name': name, + 'uptime': uptime, 'controller_cluster_name': controller_cluster_name, 'endpoint': endpoint, 'status': status_lib.ServiceStatus[status], - 'num_ready_replicas': num_ready_replicas, - 'num_unhealthy_replicas': num_unhealthy_replicas, - 'num_failed_replicas': num_failed_replicas, 'policy': policy, 'requested_resources': pickle.loads(requested_resources), + 'replica_info': pickle.loads(replica_info), } return record return None @@ -673,21 +681,19 @@ def get_services() -> List[Dict[str, Any]]: rows = _DB.cursor.execute('select * from services').fetchall() records = [] for row in rows: - (name, controller_cluster_name, endpoint, status, num_ready_replicas, - num_unhealthy_replicas, num_failed_replicas, policy, - requested_resources) = row[:9] + (name, uptime, controller_cluster_name, endpoint, status, policy, + requested_resources, replica_info) = row[:8] # TODO: use namedtuple instead of dict record = { 'name': name, + 'uptime': uptime, 'controller_cluster_name': controller_cluster_name, 'endpoint': endpoint, 'status': status_lib.ServiceStatus[status], - 'num_ready_replicas': num_ready_replicas, - 'num_unhealthy_replicas': num_unhealthy_replicas, - 'num_failed_replicas': num_failed_replicas, 'policy': policy, 'requested_resources': pickle.loads(requested_resources), + 'replica_info': pickle.loads(replica_info), } records.append(record) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index d2aa7a87b70..c8779ec3f1b 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,5 +1,8 @@ """Modules for SkyServe services.""" from sky.serve.constants import (CONTROLLER_PREFIX, CONTROLLER_TEMPLATE, CONTROLLER_YAML_PREFIX, 
SERVICE_YAML_PREFIX, - CONTROL_PLANE_PORT) + CONTROL_PLANE_PORT, CONTROLLER_RESOURCES) from sky.serve.service_spec import SkyServiceSpec +from sky.serve.serve_utils import ServeCodeGen +from sky.serve.serve_utils import load_latest_info +from sky.serve.serve_utils import load_terminate_service_result diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 44277ab4dd7..314e186d92c 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -39,25 +39,25 @@ def scale_down(self, num_nodes_to_remove: int) -> None: logger.debug(f'Scaling down by {num_nodes_to_remove} nodes') self.infra_provider.scale_down(num_nodes_to_remove) - def monitor(self) -> None: + def run(self) -> None: logger.info('Starting autoscaler monitor.') - while not self.monitor_thread_stop_event.is_set(): + while not self.run_thread_stop_event.is_set(): try: self.evaluate_scaling() except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. 
- logger.error(f'Error in autoscaler monitor: {e}') + logger.error(f'Error in autoscaler: {e}') time.sleep(self.frequency) - def start_monitor(self) -> None: - self.monitor_thread_stop_event = threading.Event() - self.monitor_thread = threading.Thread(target=self.monitor) - self.monitor_thread.start() + def start(self) -> None: + self.run_thread_stop_event = threading.Event() + self.run_thread = threading.Thread(target=self.run) + self.run_thread.start() - def terminate_monitor(self) -> None: - self.monitor_thread_stop_event.set() - self.monitor_thread.join() + def terminate(self) -> None: + self.run_thread_stop_event.set() + self.run_thread.join() class RequestRateAutoscaler(Autoscaler): diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 19792331093..3d4ba54cfd4 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -9,3 +9,10 @@ CONTROL_PLANE_PORT = 31001 CONTROL_PLANE_SYNC_INTERVAL = 20 + +CONTROLLER_RESOURCES = {'disk_size': 100, 'cpus': '4+'} + +# A period of time to initialize your service. Any readiness probe failures +# during this period will be ignored. +DEFAULT_INITIAL_DELAY_SECONDS = 1200 +DEFAULT_MIN_REPLICAS = 1 diff --git a/sky/serve/control_plane.py b/sky/serve/control_plane.py index 21fb5c2b4fe..3d69f5f872f 100644 --- a/sky/serve/control_plane.py +++ b/sky/serve/control_plane.py @@ -3,14 +3,19 @@ Responsible for autoscaling and replica management. 
""" import argparse +import base64 import fastapi import logging +import pickle from typing import Optional import uvicorn +import sky +from sky import backends from sky import serve from sky.serve import autoscalers from sky.serve import infra_providers +from sky.utils import env_options logging.basicConfig( level=logging.INFO, @@ -20,6 +25,13 @@ logger = logging.getLogger(__name__) +class SuppressSuccessGetAccessLogsFilter(logging.Filter): + + def filter(self, record: logging.LogRecord) -> bool: + message = record.getMessage() + return not ('GET' in message and '200' in message) + + class ControlPlane: """Control Plane: control everything about replica. @@ -30,18 +42,19 @@ class ControlPlane: def __init__(self, port: int, + task_yaml: str, infra_provider: infra_providers.InfraProvider, autoscaler: Optional[autoscalers.Autoscaler] = None) -> None: self.port = port + self.task_yaml = task_yaml self.infra_provider = infra_provider self.autoscaler = autoscaler self.app = fastapi.FastAPI() - # TODO(tian): Authentication!!! 
def run(self) -> None: - @self.app.post('/control_plane/get_num_requests') - async def get_num_requests(request: fastapi.Request): + @self.app.post('/control_plane/update_num_requests') + async def update_num_requests(request: fastapi.Request): # await request request_data = await request.json() # get request data @@ -61,41 +74,51 @@ def get_autoscaler_query_interval(): def get_ready_replicas(): return {'ready_replicas': self.infra_provider.get_ready_replicas()} - @self.app.get('/control_plane/get_replica_info') - def get_replica_info(): - return {'replica_info': self.infra_provider.get_replica_info()} - - @self.app.get('/control_plane/get_replica_nums') - def get_replica_nums(): - return { - 'num_ready_replicas': self.infra_provider.ready_replica_num(), - 'num_unhealthy_replicas': - self.infra_provider.unhealthy_replica_num(), - 'num_failed_replicas': self.infra_provider.failed_replica_num() + @self.app.get('/control_plane/get_latest_info') + def get_latest_info(): + latest_info = { + 'replica_info': + self.infra_provider.get_replica_info(verbose=True), + 'uptime': self.infra_provider.get_uptime(), } + latest_info = { + k: base64.b64encode(pickle.dumps(v)).decode('utf-8') + for k, v in latest_info.items() + } + return latest_info @self.app.post('/control_plane/terminate') def terminate(request: fastapi.Request): del request - # request_data = request.json() - # TODO(tian): Authentication!!! logger.info('Terminating service...') - self.infra_provider.terminate_replica_fetcher() if self.autoscaler is not None: - self.autoscaler.terminate_monitor() + logger.info('Terminate autoscaler...') + self.autoscaler.terminate() msg = self.infra_provider.terminate() + # Cleanup cloud storage + # TODO(tian): move to local serve_down so that we can cleanup + # local storage cache as well. 
+ task = sky.Task.from_yaml(self.task_yaml) + backend = backends.CloudVmRayBackend() + backend.teardown_ephemeral_storage(task) return {'message': msg} - # Run replica_monitor and autoscaler.monitor (if autoscaler is defined) + # Run replica_prober and autoscaler (if autoscaler is defined) # in separate threads in the background. # This should not block the main thread. - self.infra_provider.start_replica_fetcher() + self.infra_provider.start_replica_prober() if self.autoscaler is not None: - self.autoscaler.start_monitor() + self.autoscaler.start() + + # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid flooding + # the control plane logs. + if not env_options.Options.SHOW_DEBUG_INFO.get(): + logging.getLogger('uvicorn.access').addFilter( + SuppressSuccessGetAccessLogsFilter()) logger.info( - f'SkyServe Control Plane started on http://0.0.0.0:{self.port}') - uvicorn.run(self.app, host='0.0.0.0', port=self.port) + f'SkyServe Control Plane started on http://localhost:{self.port}') + uvicorn.run(self.app, host='localhost', port=self.port) if __name__ == '__main__': @@ -120,21 +143,22 @@ def terminate(request: fastapi.Request): _infra_provider = infra_providers.SkyPilotInfraProvider( args.task_yaml, args.service_name, - readiness_path=service_spec.readiness_path, - readiness_timeout=service_spec.readiness_timeout, + readiness_suffix=service_spec.readiness_suffix, + initial_delay_seconds=service_spec.initial_delay_seconds, post_data=service_spec.post_data) # ======= Autoscaler ========= _autoscaler = autoscalers.RequestRateAutoscaler( _infra_provider, frequency=20, - min_nodes=service_spec.min_replica, - max_nodes=service_spec.max_replica, + min_nodes=service_spec.min_replicas, + max_nodes=service_spec.max_replicas, upper_threshold=service_spec.qps_upper_threshold, lower_threshold=service_spec.qps_lower_threshold, cooldown=60, query_interval=60) # ======= ControlPlane ========= - control_plane = ControlPlane(args.port, _infra_provider, _autoscaler) + 
control_plane = ControlPlane(args.port, args.task_yaml, _infra_provider, + _autoscaler) control_plane.run() diff --git a/sky/serve/examples/http_minimal/http_minimal.yaml b/sky/serve/examples/http_minimal/http_minimal.yaml new file mode 100644 index 00000000000..d3419b25c26 --- /dev/null +++ b/sky/serve/examples/http_minimal/http_minimal.yaml @@ -0,0 +1,10 @@ +service: + port: 9090 + readiness_probe: / + +resources: + cpus: 2+ + +workdir: . + +run: python3 -m http.server 9090 diff --git a/sky/serve/examples/http_minimal/index.html b/sky/serve/examples/http_minimal/index.html new file mode 100644 index 00000000000..6a7649deacc --- /dev/null +++ b/sky/serve/examples/http_minimal/index.html @@ -0,0 +1,11 @@ + + + + + + Hello, SkyServe! + + +

Hello, SkyServe!

+ + diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml index 33a29eaa439..965e7a5c39f 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/sky/serve/examples/http_server/task.yaml @@ -1,7 +1,5 @@ resources: - cloud: gcp - ports: - - 8081 + cpus: 2+ workdir: sky/serve/examples/http_server @@ -11,7 +9,5 @@ service: port: 8081 readiness_probe: path: /health - readiness_timeout: 12000 - replica_policy: - min_replica: 1 - max_replica: 1 + initial_delay_seconds: 20 + replicas: 2 diff --git a/sky/serve/examples/llama2/llama2.yaml b/sky/serve/examples/llama2/llama2.yaml index a1317e33509..e36ecb7936e 100644 --- a/sky/serve/examples/llama2/llama2.yaml +++ b/sky/serve/examples/llama2/llama2.yaml @@ -1,5 +1,4 @@ resources: - cloud: gcp memory: 32+ accelerators: T4:1 disk_size: 1024 @@ -7,11 +6,8 @@ resources: service: port: 8087 - readiness_probe: - path: /v1/models - readiness_timeout: 1200 - replica_policy: - min_replica: 2 + readiness_probe: /v1/models + replicas: 2 envs: MODEL_SIZE: 7 diff --git a/sky/serve/examples/stable_diffusion_service.yaml b/sky/serve/examples/stable_diffusion_service.yaml index bed040b881a..4b4c86b3142 100644 --- a/sky/serve/examples/stable_diffusion_service.yaml +++ b/sky/serve/examples/stable_diffusion_service.yaml @@ -1,16 +1,12 @@ #SkyPilot YAML to run stable diffusion web tool on 1 V100 GPU. 
resources: - cloud: gcp accelerators: V100:1 service: port: 7860 - readiness_probe: - path: / - readiness_timeout: 1200 - replica_policy: - min_replica: 2 + readiness_probe: / + replicas: 2 file_mounts: /stable_diffusion: examples/stable_diffusion diff --git a/sky/serve/examples/tgi_coder.yaml b/sky/serve/examples/tgi_coder.yaml index cd55cb295ce..b6247490bc5 100644 --- a/sky/serve/examples/tgi_coder.yaml +++ b/sky/serve/examples/tgi_coder.yaml @@ -1,16 +1,10 @@ resources: accelerators: A100:1 - cloud: gcp - image_id: projects/skypilot-375900/global/images/coder - # use_spot: True - -num_nodes: 1 service: port: 8082 readiness_probe: /health + replicas: 2 run: | - volume=/home/gcpuser/sky_workdir/huggingface-vscode-endpoint-server/data/ - model=WizardLM/WizardCoder-15B-V1.0 - docker run --gpus all --shm-size 1g -p 8082:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference --model-id $model \ No newline at end of file + docker run --gpus all --shm-size 1g -p 8082:80 -v ~/data:/data ghcr.io/huggingface/text-generation-inference --model-id WizardLM/WizardCoder-15B-V1.0 diff --git a/sky/serve/examples/api_server.yaml b/sky/serve/examples/vllm.yaml similarity index 52% rename from sky/serve/examples/api_server.yaml rename to sky/serve/examples/vllm.yaml index 6aabf130553..7ae83534a69 100644 --- a/sky/serve/examples/api_server.yaml +++ b/sky/serve/examples/vllm.yaml @@ -1,21 +1,13 @@ resources: accelerators: A100:1 - cloud: gcp - # region: us-central1 - # use_spot: True - image_id: projects/skypilot-375900/global/images/fastchat-serve-v0 - -num_nodes: 1 - -file_mounts: - ~/chatlogs: - name: skypilot-chatbot-logs - store: gcs - mode: MOUNT service: port: 8081 - readiness_probe: /health/v1/models + readiness_probe: + path: /v1/models + # vllm takes 5-10 minutes to install + initial_delay_seconds: 1200 + replicas: 2 setup: | conda activate chatbot @@ -25,32 +17,28 @@ setup: | # Setup the environment conda create -n chatbot python=3.10 -y conda activate chatbot - 
pip3 install fschat + pip install pip install git+https://github.com/lm-sys/FastChat.git + pip install vllm + pip install accelerate fi run: | conda activate chatbot - python3 -m fastchat.serve.controller --host 0.0.0.0 --port 21001 > ~/controller.log 2>&1 & WORKER_IP=$(hostname -I | cut -d' ' -f1) CONTROLLER_PORT=21001 WORKER_PORT=21002 - # python3 -m fastchat.serve.model_worker \ - # --model-path lmsys/vicuna-7b-v1.3 \ - # --controller-address http://${WORKER_IP}:${CONTROLLER_PORT} \ - # --worker-address http://${WORKER_IP}:${WORKER_PORT} \ - # --host 0.0.0.0 \ - # --port ${WORKER_PORT} > ~/worker.log 2>&1 & + + python3 -m fastchat.serve.controller --host 0.0.0.0 --port ${CONTROLLER_PORT} > ~/controller.log 2>&1 & cd FastChat python3 -m fastchat.serve.vllm_worker \ - --model-path lmsys/vicuna-7b-v1.3 \ + --model-path lmsys/vicuna-7b-v1.5 \ --controller-address http://${WORKER_IP}:${CONTROLLER_PORT} \ --worker-address http://${WORKER_IP}:${WORKER_PORT} \ --host 0.0.0.0 \ --port ${WORKER_PORT} \ --tokenizer hf-internal-testing/llama-tokenizer > ~/worker.log 2>&1 & - HOST_IP=$(hostname -I | cut -d' ' -f1) python3 -m fastchat.serve.openai_api_server --host ${HOST_IP} --port 8081 diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index f727584b881..9c56054aece 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -1,78 +1,213 @@ """InfraProvider: handles the creation and deletion of endpoint replicas.""" -import base64 -import collections from concurrent import futures +import enum import logging -import multiprocessing import os -import pickle +import random import requests import signal +import subprocess import threading import time -from typing import List, Dict, Set, Optional, Any, Union +from typing import List, Dict, Set, Optional, Any, Union, Tuple -import sky from sky import backends +from sky import core +from sky import global_user_state from sky import status_lib -from sky.backends import backend_utils +from 
sky.serve import serve_utils +from sky.skylet import job_lib +from sky.utils import env_options logger = logging.getLogger(__name__) +_JOB_STATUS_FETCH_INTERVAL = 30 _PROCESS_POOL_REFRESH_INTERVAL = 20 _ENDPOINT_PROBE_INTERVAL = 10 # TODO(tian): Maybe let user determine this threshold -_CONTINUOUS_FAILURE_THRESHOLD = 180 // _ENDPOINT_PROBE_INTERVAL +_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 +_CONSECUTIVE_FAILURE_THRESHOLD_COUNT = ( + _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT // _ENDPOINT_PROBE_INTERVAL) + + +class ProcessStatus(enum.Enum): + """Process status.""" + + # The process is running + RUNNING = 'RUNNING' + + # The process is finished and success + SUCCESS = 'SUCCESS' + + # The process failed + FAILED = 'FAILED' + + +class ReplicaStatusProperty: + """Some properties that determine replica status.""" + + def __init__(self) -> None: + # Process status of sky.launch + # Initial value is RUNNING since each `ReplicaInfo` is created + # when `sky.launch` is called. + self.sky_launch_status: ProcessStatus = ProcessStatus.RUNNING + # User job status in [FAILED, FAILED_SETUP] + self.user_app_failed: bool = False + # Latest readiness probe result + self.service_ready_now: bool = False + # Whether the service has been ready at least once + # If service was not ready before, we count how long it takes to startup + # and compare it with the initial delay seconds; otherwise, we count how + # many consecutive failures it has. + self.service_once_ready: bool = False + # Process status of sky.down. None means sky.down is not called yet. 
+ self.sky_down_status: Optional[ProcessStatus] = None + + def is_scale_down_no_failure(self) -> bool: + if self.sky_launch_status != ProcessStatus.SUCCESS: + return False + if self.sky_down_status != ProcessStatus.SUCCESS: + return False + if self.user_app_failed: + return False + if not self.service_ready_now: + return False + return self.service_once_ready + + def should_track_status(self) -> bool: + if self.sky_launch_status != ProcessStatus.SUCCESS: + return False + if self.sky_down_status is not None: + return False + if self.user_app_failed: + return False + return True + + def to_replica_status(self) -> status_lib.ReplicaStatus: + if self.sky_launch_status == ProcessStatus.RUNNING: + # Still launching + return status_lib.ReplicaStatus.PROVISIONING + if self.sky_down_status is not None: + if self.sky_down_status == ProcessStatus.RUNNING: + # sky.down is running + return status_lib.ReplicaStatus.SHUTTING_DOWN + if self.sky_down_status == ProcessStatus.FAILED: + # sky.down failed + return status_lib.ReplicaStatus.FAILED_CLEANUP + if self.user_app_failed: + # Failed on user setup/run + return status_lib.ReplicaStatus.FAILED + if not self.service_once_ready: + # initial delay seconds exceeded + return status_lib.ReplicaStatus.FAILED + if not self.service_ready_now: + # Max continuous failure exceeded + return status_lib.ReplicaStatus.FAILED + if self.sky_launch_status == ProcessStatus.FAILED: + # sky.launch failed + return status_lib.ReplicaStatus.FAILED + # This indicates it is a scale_down with correct teardown. + # Should have been cleaned from the replica_info. 
+ return status_lib.ReplicaStatus.UNKNOWN + if self.sky_launch_status == ProcessStatus.FAILED: + # sky.launch failed + # down process should have been started + return status_lib.ReplicaStatus.UNKNOWN + if self.service_ready_now: + # Service is ready + return status_lib.ReplicaStatus.READY + if self.user_app_failed: + # Failed on user setup/run + # down process should have been started + return status_lib.ReplicaStatus.UNKNOWN + if self.service_once_ready: + # Service was ready before but not now + return status_lib.ReplicaStatus.NOT_READY + else: + # No readiness probe passed and sky.launch finished + return status_lib.ReplicaStatus.STARTING + + +class ReplicaInfo: + """Replica info for each replica.""" + + def __init__(self, replica_id: int, cluster_name: str) -> None: + self.replica_id: int = replica_id + self.cluster_name: str = cluster_name + self.first_not_ready_time: Optional[float] = None + self.consecutive_failure_cnt: int = 0 + self.status_property: ReplicaStatusProperty = ReplicaStatusProperty() + + @property + def handle(self) -> Optional[backends.CloudVmRayResourceHandle]: + cluster_record = global_user_state.get_cluster_from_name( + self.cluster_name) + if cluster_record is None: + return None + handle = cluster_record['handle'] + assert isinstance(handle, backends.CloudVmRayResourceHandle) + return handle + + @property + def ip(self) -> Optional[str]: + handle = self.handle + if handle is None: + return None + return handle.head_ip + + @property + def status(self) -> status_lib.ReplicaStatus: + replica_status = self.status_property.to_replica_status() + if replica_status == status_lib.ReplicaStatus.UNKNOWN: + logger.error('Detecting UNKNOWN replica status for cluster ' + f'{self.cluster_name}') + return replica_status + + def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: + info_dict = { + 'replica_id': self.replica_id, + 'name': self.cluster_name, + 'status': self.status, + } + if with_handle: + info_dict['handle'] = self.handle + return 
info_dict class InfraProvider: - """Each infra provider manages one services.""" + """Each infra provider manages one service.""" def __init__( self, - readiness_path: str, - readiness_timeout: int, + readiness_suffix: str, + initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: - self.ready_replicas: Set[str] = set() - self.unhealthy_replicas: Set[str] = set() - self.failed_replicas: Set[str] = set() - self.first_unhealthy_time: Dict[str, float] = dict() - self.continuous_failure: Dict[str, int] = collections.defaultdict(int) - self.readiness_path: str = readiness_path - self.readiness_timeout: int = readiness_timeout + # TODO(tian): make this thread safe + self.replica_info: Dict[str, ReplicaInfo] = dict() + self.readiness_suffix: str = readiness_suffix + self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data - logger.info(f'Readiness probe path: {self.readiness_path}') + self.uptime: Optional[float] = None + logger.info(f'Readiness probe suffix: {self.readiness_suffix}') + logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') logger.info(f'Post data: {self.post_data} ({type(self.post_data)})') - def get_replica_info(self) -> List[Dict[str, str]]: + def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: # Get replica info for all replicas raise NotImplementedError - def _get_replica_ips(self) -> Set[str]: - # Get all replica ips - raise NotImplementedError + def get_uptime(self) -> Optional[float]: + return self.uptime def total_replica_num(self) -> int: # Returns the total number of replicas, including those under # provisioning and deletion raise NotImplementedError - def ready_replica_num(self) -> int: - # Returns the total number of available replicas - raise NotImplementedError - def get_ready_replicas(self) -> Set[str]: # Returns the endpoints of all ready replicas raise NotImplementedError - def unhealthy_replica_num(self) -> 
int: - # Returns the total number of unhealthy replicas - raise NotImplementedError - - def failed_replica_num(self) -> int: - # Returns the number of failed replicas - raise NotImplementedError - def scale_up(self, n: int) -> None: raise NotImplementedError @@ -81,285 +216,316 @@ def scale_down(self, n: int) -> None: # delete or the number of replicas to delete raise NotImplementedError - def _terminate_replicas(self, unhealthy_replicas: Set[str]) -> None: - # Terminates the replicas with endpoints in the list - raise NotImplementedError - def terminate(self) -> Optional[str]: # Terminate service raise NotImplementedError - def start_replica_fetcher(self) -> None: + def start_replica_prober(self) -> None: # Start the replica fetcher thread raise NotImplementedError - def terminate_replica_fetcher(self) -> None: - # Terminate the replica fetcher thread - raise NotImplementedError - - def probe_all_endpoints(self) -> None: - # Probe readiness of all endpoints - raise NotImplementedError - class SkyPilotInfraProvider(InfraProvider): """Infra provider for SkyPilot clusters.""" - def __init__(self, task_yaml_path: str, cluster_name_prefix: str, *args, + def __init__(self, task_yaml_path: str, service_name: str, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.task_yaml_path: str = task_yaml_path - self.cluster_name_prefix: str = cluster_name_prefix + '-' - self.id_counter: int = 1 - self.launch_process_pool: Dict[str, multiprocessing.Process] = dict() - self.down_process_pool: Dict[str, multiprocessing.Process] = dict() + self.service_name: str = service_name + self.next_replica_id: int = 1 + self.launch_process_pool: Dict[str, subprocess.Popen] = dict() + self.down_process_pool: Dict[str, subprocess.Popen] = dict() - self._start_refresh_process_pool() + self._start_process_pool_refresher() + self._start_job_status_fetcher() + # This process periodically checks all sky.launch and sky.down process + # on the fly. 
If any of them finished, it will update the status of + # the corresponding replica. def _refresh_process_pool(self) -> None: - while not self.refresh_process_pool_stop_event.is_set(): - logger.info('Refreshing process pool.') - for op, pool in zip( - ['Launch', 'Down'], - [self.launch_process_pool, self.down_process_pool]): - for cluster_name, p in list(pool.items()): - if not p.is_alive(): - # TODO(tian): Try-catch in process, and have an enum - # return value to indicate which type of failure - # happened. Currently we only have user code failure - # since the retry_until_up flag is set to True, but it - # will be helpful when we enable user choose whether to - # retry or not. - logger.info( - f'{op} process for {cluster_name} finished.') - del pool[cluster_name] - if p.exitcode != 0: - logger.info( - f'{op} process for {cluster_name} exited ' - f'abnormally with code {p.exitcode}.') - self.failed_replicas.add(cluster_name) - time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) + for cluster_name, p in list(self.launch_process_pool.items()): + if p.poll() is not None: + # TODO(tian): Try-catch in process, and have an enum return + # value to indicate which type of failure happened. + # Currently we only have user code failure since the + # retry_until_up flag is set to True, but it will be helpful + # when we enable user choose whether to retry or not. + logger.info(f'Launch process for {cluster_name} finished.') + del self.launch_process_pool[cluster_name] + info = self.replica_info[cluster_name] + if p.returncode != 0: + logger.warning(f'Launch process for {cluster_name} exited ' + f'abnormally with code {p.returncode}. 
' + 'Terminating...') + info.status_property.sky_launch_status = ( + ProcessStatus.FAILED) + self._teardown_cluster(cluster_name) + else: + info.status_property.sky_launch_status = ( + ProcessStatus.SUCCESS) + for cluster_name, p in list(self.down_process_pool.items()): + if p.poll() is not None: + logger.info(f'Down process for {cluster_name} finished.') + del self.down_process_pool[cluster_name] + info = self.replica_info[cluster_name] + if p.returncode != 0: + logger.error(f'Down process for {cluster_name} exited ' + f'abnormally with code {p.returncode}.') + info.status_property.sky_down_status = ( + ProcessStatus.FAILED) + else: + info.status_property.sky_down_status = ( + ProcessStatus.SUCCESS) + # Failed replica still count as a replica. In our current + # design, we want to fail early if user code have any error. + # This will prevent infinite loop of teardown and + # re-provision. + if info.status_property.is_scale_down_no_failure(): + # This means the cluster is deleted due to + # a scale down. Delete the replica info + # so it won't count as a replica. + del self.replica_info[cluster_name] + logger.info(f'Cluster {cluster_name} removed from the ' + 'replica info normally.') + else: + logger.info(f'Termination of cluster {cluster_name} ' + 'finished. Replica info is kept since some ' + 'failure detected.') - def _start_refresh_process_pool(self) -> None: - self.refresh_process_pool_stop_event = threading.Event() - self.refresh_process_pool_thread = threading.Thread( - target=self._refresh_process_pool) - self.refresh_process_pool_thread.start() - - def _terminate_refresh_process_pool(self) -> None: - self.refresh_process_pool_stop_event.set() - self.refresh_process_pool_thread.join() - - def _get_ip_clusname_map(self) -> Dict[str, str]: - """ - Returns a map of ip to cluster name for all clusters. 
- """ - clusters = sky.global_user_state.get_clusters() - ip_clusname_map = {} - dummy_counter = 0 - for cluster in clusters: - name = cluster['name'] - handle = cluster['handle'] + # TODO(tian): Maybe use decorator? + def _process_pool_refresher(self) -> None: + while not self.process_pool_refresher_stop_event.is_set(): + logger.info('Refreshing process pool.') try: - # Get the head node ip - ip = backend_utils.get_node_ips(handle.cluster_yaml, - handle.launched_nodes, - handle)[0] - ip_clusname_map[ip] = name - except sky.exceptions.FetchIPError: - logger.warning(f'Unable to get IP for cluster {name}.' - 'Use dummp IP instead.') - ip_clusname_map[f'10.0.0.{dummy_counter}'] = name - dummy_counter += 1 - continue - return ip_clusname_map + self._refresh_process_pool() + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # process pool refresher running. + logger.error(f'Error in process pool refresher: {e}') + time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) - def get_replica_info(self) -> List[Dict[str, str]]: + def _start_process_pool_refresher(self) -> None: + self.process_pool_refresher_stop_event = threading.Event() + self.process_pool_refresher_thread = threading.Thread( + target=self._process_pool_refresher) + self.process_pool_refresher_thread.start() - def _get_replica_status(cluster_status: status_lib.ClusterStatus, - ip: str) -> status_lib.ReplicaStatus: - if ip in self.ready_replicas: - return status_lib.ReplicaStatus.READY - if ip in self.failed_replicas: - return status_lib.ReplicaStatus.FAILED - if cluster_status == status_lib.ClusterStatus.UP: - return status_lib.ReplicaStatus.UNHEALTHY - return status_lib.ReplicaStatus.INIT - - # TODO(tian): Return failed replica info here if it is already - # be torn down. 
- clusters = sky.global_user_state.get_clusters() - infos = [] - for cluster in clusters: - handle = cluster['handle'] - assert isinstance(handle, backends.CloudVmRayResourceHandle) - ip = handle.head_ip - info = { - 'name': cluster['name'], - 'handle': handle, - 'status': _get_replica_status(cluster['status'], ip), - } - info = { - k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in info.items() - } - infos.append(info) - return infos - - def _get_replica_ips(self) -> Set[str]: - ips = set(self._get_ip_clusname_map().keys()) - logger.info(f'Returning SkyPilot endpoints: {ips}') - return ips + def _fetch_job_status(self) -> None: + for cluster_name, info in self.replica_info.items(): + if not info.status_property.should_track_status(): + continue + # Only fetch job 1, which stands for user task job + job_statuses = core.job_status(cluster_name, [1]) + job_status = job_statuses['1'] + if job_status in [ + job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP + ]: + info.status_property.user_app_failed = True + logger.info(f'User APP for cluster {cluster_name} FAILED. ' + 'Terminating...') + self._teardown_cluster(cluster_name) + + def _job_status_fetcher(self) -> None: + while not self.job_status_fetcher_stop_event.is_set(): + logger.info('Refreshing job status.') + try: + self._fetch_job_status() + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # job status fetcher running. 
+ logger.error(f'Error in job status fetcher: {e}') + time.sleep(_JOB_STATUS_FETCH_INTERVAL) + + def _start_job_status_fetcher(self) -> None: + self.job_status_fetcher_stop_event = threading.Event() + self.job_status_fetcher_thread = threading.Thread( + target=self._job_status_fetcher) + self.job_status_fetcher_thread.start() + + def _terminate_daemon_threads(self) -> None: + self.replica_prober_stop_event.set() + self.job_status_fetcher_stop_event.set() + self.process_pool_refresher_stop_event.set() + self.replica_prober_thread.join() + self.job_status_fetcher_thread.join() + self.process_pool_refresher_thread.join() + + def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: + return [ + info.to_info_dict(with_handle=verbose) + for info in self.replica_info.values() + ] def total_replica_num(self) -> int: - clusters = sky.global_user_state.get_clusters() - # All replica launched in controller is a replica. - return len(clusters) + return len(self.replica_info) def get_ready_replicas(self) -> Set[str]: - return self.ready_replicas - - def ready_replica_num(self) -> int: - return len(self.ready_replicas) - - def unhealthy_replica_num(self) -> int: - return len(self.unhealthy_replicas) - - def failed_replica_num(self) -> int: - return len(self.failed_replicas) - - def _launch_cluster(self, cluster_name: str, task: sky.Task) -> None: - p = multiprocessing.Process(target=sky.launch, - args=(task,), - kwargs={ - 'cluster_name': cluster_name, - 'detach_run': True, - 'retry_until_up': True - }) + ready_replicas = set() + for info in self.replica_info.values(): + if info.status == status_lib.ReplicaStatus.READY: + assert info.ip is not None + ready_replicas.add(info.ip) + return ready_replicas + + def _launch_cluster(self, replica_id: int) -> None: + cluster_name = serve_utils.generate_replica_cluster_name( + self.service_name, replica_id) + if cluster_name in self.launch_process_pool: + logger.warning(f'Launch process for cluster {cluster_name} ' + 'already 
exists. Skipping.') + return + logger.info(f'Creating SkyPilot cluster {cluster_name}') + cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] + cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) + fn = serve_utils.generate_replica_launch_log_file_name(cluster_name) + with open(fn, 'w') as f: + # pylint: disable=consider-using-with + p = subprocess.Popen(cmd, + stdin=subprocess.DEVNULL, + stdout=f, + stderr=f) self.launch_process_pool[cluster_name] = p - p.start() + assert cluster_name not in self.replica_info + self.replica_info[cluster_name] = ReplicaInfo(replica_id, cluster_name) def _scale_up(self, n: int) -> None: # Launch n new clusters - task = sky.Task.from_yaml(self.task_yaml_path) for _ in range(0, n): - cluster_name = f'{self.cluster_name_prefix}{self.id_counter}' - logger.info(f'Creating SkyPilot cluster {cluster_name}') - self._launch_cluster(cluster_name, task) - self.id_counter += 1 + self._launch_cluster(self.next_replica_id) + self.next_replica_id += 1 def scale_up(self, n: int) -> None: self._scale_up(n) - def _teardown_cluster(self, cluster_name: str) -> None: - p = multiprocessing.Process(target=sky.down, - args=(cluster_name,), - kwargs={'purge': True}) + def _teardown_cluster(self, + cluster_name: str, + sync_down_logs: bool = True) -> None: + if cluster_name in self.down_process_pool: + logger.warning(f'Down process for cluster {cluster_name} already ' + 'exists. 
Skipping.') + return + + if sync_down_logs: + logger.info(f'Syncing down logs for cluster {cluster_name}...') + replica_id = serve_utils.get_replica_id_from_cluster_name( + cluster_name) + code = serve_utils.ServeCodeGen.stream_logs( + self.service_name, + replica_id, + follow=False, + skip_local_log_file_check=True) + local_log_file_name = ( + serve_utils.generate_replica_local_log_file_name(cluster_name)) + with open(local_log_file_name, 'w') as f: + subprocess.run(code, shell=True, check=True, stdout=f) + + logger.info(f'Deleting SkyPilot cluster {cluster_name}') + cmd = ['sky', 'down', cluster_name, '-y'] + fn = serve_utils.generate_replica_down_log_file_name(cluster_name) + with open(fn, 'w') as f: + # pylint: disable=consider-using-with + p = subprocess.Popen(cmd, + stdin=subprocess.DEVNULL, + stdout=f, + stderr=f) self.down_process_pool[cluster_name] = p - p.start() + info = self.replica_info[cluster_name] + info.status_property.sky_down_status = ProcessStatus.RUNNING def _scale_down(self, n: int) -> None: - # Delete n clusters - # Currently deletes the first n clusters - clusters = sky.global_user_state.get_clusters() - num_clusters = len(clusters) - if num_clusters > 0: - if n > num_clusters: + # Randomly delete n ready replicas + all_ready_replicas = self.get_ready_replicas() + num_replicas = len(all_ready_replicas) + if num_replicas > 0: + if n > num_replicas: logger.warning( - f'Trying to delete {n} clusters, but only {num_clusters} ' - 'clusters exist. Deleting all clusters.') - n = num_clusters - for i in range(0, n): - cluster = clusters[i] - logger.info(f'Deleting SkyPilot cluster {cluster["name"]}') - self._teardown_cluster(cluster['name']) + f'Trying to delete {n} replicas, but only {num_replicas} ' + 'replicas exist. 
Deleting all replicas.') + n = num_replicas + cluster_to_terminate = random.sample(all_ready_replicas, n) + for cluster_name in cluster_to_terminate: + logger.info(f'Scaling down cluster {cluster_name}') + self._teardown_cluster(cluster_name) def scale_down(self, n: int) -> None: self._scale_down(n) - def _terminate_replicas(self, unhealthy_replicas: Set[str]) -> None: - # Remove unhealthy replicas from current_endpoints - logger.info('SkyPilotInfraProvider._terminate_replicas called with ' - f'unhealthy_replicas={unhealthy_replicas}') - for endpoint_url in unhealthy_replicas: - ip_to_name_map = self._get_ip_clusname_map() - if endpoint_url not in ip_to_name_map: - logger.warning( - f'Unable to find cluster name for endpoint {endpoint_url}. ' - 'Skipping.') - continue - name = ip_to_name_map[endpoint_url] - if endpoint_url in unhealthy_replicas: - logger.info(f'Deleting SkyPilot cluster {name}') - self._teardown_cluster(name) - def terminate(self) -> Optional[str]: - # For correctly show serve status - self.ready_replicas.clear() - self.unhealthy_replicas = self._get_replica_ips() - self._terminate_refresh_process_pool() + logger.info('Terminating infra provider daemon threads...') + self._terminate_daemon_threads() + logger.info('Terminating all clusters...') for name, p in self.launch_process_pool.items(): # Use keyboard interrupt here since sky.launch has great # handling for it # Edge case: sky.launched finished after the - # process_pool_refresh_process terminates - if p.is_alive(): + # process_pool_refresher terminates + if p.poll() is None: assert p.pid is not None - os.kill(p.pid, signal.SIGINT) - p.join() - self._teardown_cluster(name) - logger.info(f'Interrupted launch process for cluster {name}' + os.killpg(os.getpgid(p.pid), signal.SIGINT) + p.wait() + logger.info(f'Interrupted launch process for cluster {name} ' 'and deleted the cluster.') - replica_ips = self._get_replica_ips() - self._terminate_replicas(replica_ips) + self._teardown_cluster(name, 
sync_down_logs=False) + info = self.replica_info[name] + # Set to success here for correctly display as shutting down + info.status_property.sky_launch_status = ProcessStatus.SUCCESS + for name, info in self.replica_info.items(): + # Skip those already deleted and those are deleting + if info.status not in [ + status_lib.ReplicaStatus.FAILED, + status_lib.ReplicaStatus.SHUTTING_DOWN + ]: + self._teardown_cluster(name, sync_down_logs=False) msg = [] for name, p in self.down_process_pool.items(): - p.join() + p.wait() logger.info(f'Down process for cluster {name} finished.') - if p.exitcode != 0: + if p.returncode != 0: + logger.warning(f'Down process for cluster {name} exited ' + f'abnormally with code {p.returncode}.') msg.append(f'Down process for cluster {name} exited abnormally' - f' with code {p.exitcode}. Please login to the ' + f' with code {p.returncode}. Please login to the ' 'controller and make sure the cluster is released.') if not msg: return None return '\n'.join(msg) - def _replica_fetcher(self) -> None: - while not self.replica_fetcher_stop_event.is_set(): - logger.info('Running replica fetcher.') + def _replica_prober(self) -> None: + while not self.replica_prober_stop_event.is_set(): + logger.info('Running replica prober.') try: - self.probe_all_endpoints() + self._probe_all_replicas() except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the - # replica fetcher running. - logger.error(f'Error in replica fetcher: {e}') + # replica prober running. 
+ logger.error(f'Error in replica prober: {e}') time.sleep(_ENDPOINT_PROBE_INTERVAL) - def start_replica_fetcher(self) -> None: - self.replica_fetcher_stop_event = threading.Event() - self.replica_fetcher_thread = threading.Thread( - target=self._replica_fetcher) - self.replica_fetcher_thread.start() + def start_replica_prober(self) -> None: + self.replica_prober_stop_event = threading.Event() + self.replica_prober_thread = threading.Thread( + target=self._replica_prober) + self.replica_prober_thread.start() - def terminate_replica_fetcher(self) -> None: - self.replica_fetcher_stop_event.set() - self.replica_fetcher_thread.join() + def _probe_all_replicas(self) -> None: + replica_info = self.get_replica_info( + verbose=env_options.Options.SHOW_DEBUG_INFO.get()) + logger.info(f'All replica info: {replica_info}') - def probe_all_endpoints(self) -> None: - replica_ips = self._get_replica_ips() - self.failed_replicas - - def probe_endpoint(replica_ip: str) -> Optional[str]: + def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: + replica_ip = info.ip try: msg = '' - readiness_url = f'http://{replica_ip}{self.readiness_path}' + readiness_suffix = f'http://{replica_ip}{self.readiness_suffix}' if self.post_data is not None: msg += 'Post' - response = requests.post(readiness_url, + response = requests.post(readiness_suffix, json=self.post_data, timeout=3) else: msg += 'Get' - response = requests.get(readiness_url, timeout=3) + response = requests.get(readiness_suffix, timeout=3) msg += (f' request to {replica_ip} returned status code ' f'{response.status_code}') if response.status_code == 200: @@ -368,51 +534,65 @@ def probe_endpoint(replica_ip: str) -> Optional[str]: msg += f' and response {response.text}.' 
logger.info(msg) if response.status_code == 200: - logger.info(f'Replica {replica_ip} is available.') - return replica_ip + logger.info(f'Replica {replica_ip} is ready.') + if self.uptime is None: + self.uptime = time.time() + logger.info(f'Replica {replica_ip} is the first ' + 'ready replica. Setting uptime to ' + f'{self.uptime}.') + return info.cluster_name, True except requests.exceptions.RequestException as e: logger.info(e) - logger.info(f'Replica {replica_ip} is not available.') + logger.info(f'Replica {replica_ip} is not ready.') pass - return None + return info.cluster_name, False + probe_futures = [] + replica_to_probe = [] with futures.ThreadPoolExecutor() as executor: - probe_futures = [ - executor.submit(probe_endpoint, replica_ip) - for replica_ip in replica_ips - ] - ready_replicas = set() - for future in futures.as_completed(probe_futures): - ip = future.result() - if ip is not None: - ready_replicas.add(ip) - - logger.info(f'Ready replicas: {ready_replicas}') - self.ready_replicas = ready_replicas - unhealthy_replicas = replica_ips - ready_replicas - logger.info(f'Unhealthy replicas: {unhealthy_replicas}') - self.unhealthy_replicas = unhealthy_replicas - - for replica in ready_replicas: - self.continuous_failure[replica] = 0 - - replicas_to_terminate = set() - for replica in unhealthy_replicas: - if replica not in self.first_unhealthy_time: - self.first_unhealthy_time[replica] = time.time() - self.continuous_failure[replica] += 1 - # coldstart time limitation is `self.readiness_timeout`. 
- first_unhealthy_time = self.first_unhealthy_time[replica] - if time.time() - first_unhealthy_time > self.readiness_timeout: - continuous_failure = self.continuous_failure[replica] - if continuous_failure > _CONTINUOUS_FAILURE_THRESHOLD: - logger.info(f'Terminating replica {replica}.') - replicas_to_terminate.add(replica) + for cluster_name, info in self.replica_info.items(): + if not info.status_property.should_track_status(): + continue + replica_to_probe.append((info.cluster_name, info.ip)) + probe_futures.append(executor.submit(_probe_replica, info)) + logger.info(f'Replicas to probe: {replica_to_probe}') + + for future in futures.as_completed(probe_futures): + cluster_name, res = future.result() + info = self.replica_info[cluster_name] + info.status_property.service_ready_now = res + if res: + if not info.status_property.service_once_ready: + info.status_property.service_once_ready = True + continue + if info.first_not_ready_time is None: + info.first_not_ready_time = time.time() + if info.status_property.service_once_ready: + info.consecutive_failure_cnt += 1 + if (info.consecutive_failure_cnt >= + _CONSECUTIVE_FAILURE_THRESHOLD_COUNT): + logger.info(f'Replica {cluster_name} is not ready for too ' + 'long and exceeding consecutive failure ' + 'threshold. Terminating the replica...') + self._teardown_cluster(cluster_name) else: - logger.info(f'Replica {replica} is unhealthy but ' - 'within unhealthy threshold. Skipping.') + current_unready_time = (info.consecutive_failure_cnt * + _ENDPOINT_PROBE_INTERVAL) + logger.info(f'Replica {cluster_name} is not ready but ' + 'within consecutive failure threshold ' + f'({current_unready_time}s / ' + f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' + 'Skipping.') else: - logger.info(f'Replica {replica} is unhealthy but within ' - 'readiness timeout. 
Skipping.') - - self._terminate_replicas(replicas_to_terminate) + current_delay_seconds = time.time() - info.first_not_ready_time + if current_delay_seconds > self.initial_delay_seconds: + logger.info(f'Replica {cluster_name} is not ready and ' + 'exceeding initial delay seconds. ' + 'Terminating the replica...') + self._teardown_cluster(cluster_name) + else: + current_delay_seconds = int(current_delay_seconds) + logger.info( + f'Replica {cluster_name} is not ready but within ' + f'initial delay seconds ({current_delay_seconds}s / ' + f'{self.initial_delay_seconds}s). Skipping.') diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancers.py index c04a3986d0b..3b831ca4a3e 100644 --- a/sky/serve/load_balancers.py +++ b/sky/serve/load_balancers.py @@ -65,5 +65,11 @@ def select_replica(self, request: fastapi.Request) -> Optional[str]: return None replica_ip = self.replicas_queue.popleft() self.replicas_queue.append(replica_ip) - logger.info(f'Selected replica {replica_ip} for request {request}') + request_repr = ('') + logger.info(f'Selected replica {replica_ip} for request {request_repr}') return replica_ip diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py index e1507cef8e7..7af4b6ce6b6 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/redirector.py @@ -53,7 +53,7 @@ def _sync_with_control_plane(self): # send request num in last query interval response = session.post( self.control_plane_url + - '/control_plane/get_num_requests', + '/control_plane/update_num_requests', json={ 'num_requests': self.load_balancer.deprecate_old_requests() @@ -78,7 +78,9 @@ async def _redirector_handler(self, request: fastapi.Request): if replica_ip is None: raise fastapi.HTTPException(status_code=503, - detail='No available replicas') + detail='No available replicas. 
' + 'Use "sky serve status [SERVICE_ID]" ' + 'to check the replica status.') path = f'http://{replica_ip}:{self.port}{request.url.path}' logger.info(f'Redirecting request to {path}') diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py new file mode 100644 index 00000000000..eb8a4c71c05 --- /dev/null +++ b/sky/serve/serve_utils.py @@ -0,0 +1,254 @@ +"""User interface with the SkyServe.""" +import base64 +import colorama +import os +import pickle +import re +import requests +import shlex +import time +from typing import Any, Dict, List, Optional, Iterator, TextIO, Callable + +from sky import backends +from sky import global_user_state +from sky.serve import constants +from sky import status_lib +from sky.utils import common_utils + +_CONTROL_PLANE_URL = f'http://localhost:{constants.CONTROL_PLANE_PORT}' +_SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' +_SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' +_FAILED_TO_FIND_REPLICA_MSG = ( + f'{colorama.Fore.RED}Failed to find replica ' + '{replica_id}. 
Please use `sky serve status [SERVICE_ID]`' + f' to check all valid replica id.{colorama.Style.RESET_ALL}') + + +def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: + return f'{service_name}-{replica_id}' + + +def get_replica_id_from_cluster_name(cluster_name: str) -> int: + return int(cluster_name.split('-')[-1]) + + +def generate_replica_launch_log_file_name(cluster_name: str) -> str: + cluster_name = cluster_name.replace('-', '_') + prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) + return f'{prefix}/{cluster_name}_launch.log' + + +def generate_replica_down_log_file_name(cluster_name: str) -> str: + cluster_name = cluster_name.replace('-', '_') + prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) + return f'{prefix}/{cluster_name}_down.log' + + +def generate_replica_local_log_file_name(cluster_name: str) -> str: + cluster_name = cluster_name.replace('-', '_') + prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) + return f'{prefix}/{cluster_name}_local.log' + + +def get_latest_info() -> str: + resp = requests.get(_CONTROL_PLANE_URL + '/control_plane/get_latest_info') + if resp.status_code != 200: + raise ValueError(f'Failed to get replica info: {resp.text}') + return common_utils.encode_payload(resp.json()) + + +def load_latest_info(payload: str) -> Dict[str, Any]: + latest_info = common_utils.decode_payload(payload) + latest_info = { + k: pickle.loads(base64.b64decode(v)) for k, v in latest_info.items() + } + return latest_info + + +def terminate_service() -> str: + resp = requests.post(_CONTROL_PLANE_URL + '/control_plane/terminate') + resp = base64.b64encode(pickle.dumps(resp)).decode('utf-8') + return common_utils.encode_payload(resp) + + +def load_terminate_service_result(payload: str) -> Any: + terminate_resp = common_utils.decode_payload(payload) + terminate_resp = pickle.loads(base64.b64decode(terminate_resp)) + return terminate_resp + + +def _follow_logs(file: TextIO, + cluster_name: str, + *, + 
finish_stream: Callable[[], bool], + no_new_content_timeout: Optional[int] = None) -> Iterator[str]: + line = '' + log_file = None + no_new_content_cnt = 0 + + def cluster_is_up() -> bool: + cluster_record = global_user_state.get_cluster_from_name(cluster_name) + if cluster_record is None: + return False + return cluster_record['status'] == status_lib.ClusterStatus.UP + + while True: + tmp = file.readline() + if tmp is not None and tmp != '': + no_new_content_cnt = 0 + line += tmp + if '\n' in line or '\r' in line: + # Tailing detailed progress for user. All logs in skypilot is + # of format `To view detailed progress: tail -n100 -f *.log`. + x = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line) + if x is not None: + log_file = os.path.expanduser(x.group(1)) + elif re.match(_SKYPILOT_LOG_PATTERN, line) is None: + # Not print other logs (file sync logs) since we lack + # utility to determine when these log files are finished + # writing. + # TODO(tian): Not skip these logs since there are small + # chance that error will happen in file sync. Need to find + # a better way to do this. + yield line + # Output next line first since it indicates the process is + # starting. For our launching logs, it's always: + # Launching on () + if log_file is not None: + with open(log_file, 'r', newline='') as f: + # We still exit if more than 10 seconds without new + # content to avoid any internal bug that causes + # the launch failed and cluster status remains INIT. 
+ for l in _follow_logs(f, + cluster_name, + finish_stream=cluster_is_up, + no_new_content_timeout=10): + yield l + log_file = None + line = '' + else: + if finish_stream(): + break + if no_new_content_timeout is not None: + if no_new_content_cnt >= no_new_content_timeout: + break + no_new_content_cnt += 1 + time.sleep(1) + + +def stream_logs(service_name: str, + replica_id: int, + follow: bool, + skip_local_log_file_check: bool = False) -> str: + print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process ' + f'of replica {replica_id}.{colorama.Style.RESET_ALL}') + replica_cluster_name = generate_replica_cluster_name( + service_name, replica_id) + local_log_file_name = generate_replica_local_log_file_name( + replica_cluster_name) + + if not skip_local_log_file_check and os.path.exists(local_log_file_name): + # When sync down, we set skip_local_log_file_check to False so it won't + # detect the just created local log file. Otherwise, it indicates the + # replica is already been terminated. All logs should be in the local + # log file and we don't need to stream logs for it. + with open(local_log_file_name, 'r') as f: + print(f.read(), flush=True) + return '' + + handle = global_user_state.get_handle_from_cluster_name( + replica_cluster_name) + if handle is None: + return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id) + assert isinstance(handle, backends.CloudVmRayResourceHandle), handle + + launch_log_file_name = generate_replica_launch_log_file_name( + replica_cluster_name) + if not os.path.exists(launch_log_file_name): + return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.' 
+ f'{colorama.Style.RESET_ALL}') + + def _get_replica_status() -> status_lib.ReplicaStatus: + resp = requests.get(_CONTROL_PLANE_URL + + '/control_plane/get_latest_info') + if resp.status_code != 200: + raise ValueError( + f'{colorama.Fore.RED}Failed to get replica info for service ' + f'{service_name}.{colorama.Style.RESET_ALL}') + replica_info = resp.json()['replica_info'] + replica_info = pickle.loads(base64.b64decode(replica_info)) + target_info: Optional[Dict[str, Any]] = None + for info in replica_info: + if info['replica_id'] == replica_id: + target_info = info + break + if target_info is None: + raise ValueError( + _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)) + return target_info['status'] + + finish_stream = (lambda: not follow or _get_replica_status() != status_lib. + ReplicaStatus.PROVISIONING) + with open(launch_log_file_name, 'r', newline='') as f: + for line in _follow_logs(f, + replica_cluster_name, + finish_stream=finish_stream): + print(line, end='', flush=True) + if not follow and _get_replica_status( + ) == status_lib.ReplicaStatus.PROVISIONING: + # Early exit if not following the logs. + return '' + + backend = backends.CloudVmRayBackend() + # Always tail the logs of the first job, which represent user setup & run. + returncode = backend.tail_logs(handle, job_id=1, follow=follow) + if returncode != 0: + return (f'{colorama.Fore.RED}Failed to stream logs for replica ' + f'{replica_id}.{colorama.Style.RESET_ALL}') + return '' + + +class ServeCodeGen: + """Code generator for SkyServe. 
+ + Usage: + >> code = ServeCodeGen.get_latest_info() + """ + _PREFIX = [ + 'from sky.serve import serve_utils', + ] + + @classmethod + def get_latest_info(cls) -> str: + code = [ + 'msg = serve_utils.get_latest_info()', + 'print(msg, end="", flush=True)' + ] + return cls._build(code) + + @classmethod + def terminate_service(cls) -> str: + code = [ + 'msg = serve_utils.terminate_service()', + 'print(msg, end="", flush=True)' + ] + return cls._build(code) + + @classmethod + def stream_logs(cls, + service_name: str, + replica_id: int, + follow: bool, + skip_local_log_file_check: bool = False) -> str: + code = [ + f'msg = serve_utils.stream_logs({service_name!r}, {replica_id!r}, ' + f'follow={follow}, skip_local_log_file_check=' + f'{skip_local_log_file_check})', 'print(msg, flush=True)' + ] + return cls._build(code) + + @classmethod + def _build(cls, code: List[str]) -> str: + code = cls._PREFIX + code + generated_code = '; '.join(code) + return f'python3 -u -c {shlex.quote(generated_code)}' diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 133e038f366..f05edfceb6b 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -1,6 +1,8 @@ """Service specification for SkyServe.""" import os +import json import yaml +import textwrap from typing import Optional, Dict, Any from sky.backends import backend_utils @@ -15,34 +17,48 @@ class SkyServiceSpec: def __init__( self, readiness_path: str, - readiness_timeout: int, + initial_delay_seconds: int, app_port: int, - min_replica: int, - max_replica: Optional[int] = None, + min_replicas: int, + max_replicas: Optional[int] = None, qps_upper_threshold: Optional[float] = None, qps_lower_threshold: Optional[float] = None, post_data: Optional[Dict[str, Any]] = None, + controller_resources: Optional[Dict[str, Any]] = None, ): - if max_replica is not None and max_replica < min_replica: + if min_replicas < 0: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'max_replica must be 
greater than or equal to min_replica') + 'min_replicas must be greater than or equal to 0') + if max_replicas is not None and max_replicas < min_replicas: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'max_replicas must be greater than or equal to min_replicas' + ) if app_port == constants.CONTROL_PLANE_PORT: with ux_utils.print_exception_no_traceback(): raise ValueError( f'App port cannot be {constants.CONTROL_PLANE_PORT} ' 'since it is reserved for the control plane. ' ' Please use a different port.') - # TODO: check if the path is valid - self._readiness_path = f':{app_port}{readiness_path}' - self._readiness_timeout = readiness_timeout - # TODO: check if the port is valid + if not readiness_path.startswith('/'): + with ux_utils.print_exception_no_traceback(): + raise ValueError('readiness_path must start with a slash (/). ' + f'Got: {readiness_path}') + self._readiness_path = readiness_path + self._initial_delay_seconds = initial_delay_seconds + if app_port < 0 or app_port > 65535: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Invalid app port: {app_port}. ' + 'Please use a port number between 0 and 65535.') self._app_port = str(app_port) - self._min_replica = min_replica - self._max_replica = max_replica + self._min_replicas = min_replicas + self._max_replicas = max_replicas self._qps_upper_threshold = qps_upper_threshold self._qps_lower_threshold = qps_lower_threshold self._post_data = post_data + self._controller_resources = controller_resources @staticmethod def from_yaml_config(config: Optional[Dict[str, Any]]): @@ -51,21 +67,62 @@ def from_yaml_config(config: Optional[Dict[str, Any]]): backend_utils.validate_schema(config, schemas.get_service_schema(), 'Invalid service YAML:') + if 'replicas' in config and 'replica_policy' in config: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify both `replicas` and `replica_policy` in ' + 'the service YAML. 
Please use one of them.') service_config = {} - service_config['readiness_path'] = config['readiness_probe']['path'] - service_config['readiness_timeout'] = config['readiness_probe'][ - 'readiness_timeout'] service_config['app_port'] = config['port'] - service_config['min_replica'] = config['replica_policy']['min_replica'] - service_config['max_replica'] = config['replica_policy'].get( - 'max_replica', None) - service_config['qps_upper_threshold'] = config['replica_policy'].get( - 'qps_upper_threshold', None) - service_config['qps_lower_threshold'] = config['replica_policy'].get( - 'qps_lower_threshold', None) - service_config['post_data'] = config['readiness_probe'].get( - 'post_data', None) + + readiness_section = config['readiness_probe'] + if isinstance(readiness_section, str): + service_config['readiness_path'] = readiness_section + initial_delay_seconds = None + post_data = None + else: + service_config['readiness_path'] = readiness_section['path'] + initial_delay_seconds = readiness_section.get( + 'initial_delay_seconds', None) + post_data = readiness_section.get('post_data', None) + if initial_delay_seconds is None: + ids = constants.DEFAULT_INITIAL_DELAY_SECONDS + initial_delay_seconds = ids + service_config['initial_delay_seconds'] = initial_delay_seconds + if isinstance(post_data, str): + try: + post_data = json.loads(post_data) + except json.JSONDecodeError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Invalid JSON string for `post_data` in the ' + '`readiness_probe` section of your service YAML.' 
+ ) from e + service_config['post_data'] = post_data + + policy_section = config.get('replica_policy', None) + simplified_policy_section = config.get('replicas', None) + if policy_section is None or simplified_policy_section is not None: + if simplified_policy_section is not None: + min_replicas = simplified_policy_section + else: + min_replicas = constants.DEFAULT_MIN_REPLICAS + service_config['min_replicas'] = min_replicas + service_config['max_replicas'] = None + service_config['qps_upper_threshold'] = None + service_config['qps_lower_threshold'] = None + else: + service_config['min_replicas'] = policy_section['min_replicas'] + service_config['max_replicas'] = policy_section.get( + 'max_replicas', None) + service_config['qps_upper_threshold'] = policy_section.get( + 'qps_upper_threshold', None) + service_config['qps_lower_threshold'] = policy_section.get( + 'qps_lower_threshold', None) + + service_config['controller_resources'] = config.pop( + 'controller_resources', None) return SkyServiceSpec(**service_config) @@ -104,48 +161,67 @@ def add_if_not_none(section, key, value, no_empty: bool = False): config[section][key] = value add_if_not_none('port', None, int(self.app_port)) - add_if_not_none('readiness_probe', 'path', - self.readiness_path[len(f':{self.app_port}'):]) - add_if_not_none('readiness_probe', 'readiness_timeout', - self.readiness_timeout) + add_if_not_none('readiness_probe', 'path', self.readiness_path) + add_if_not_none('readiness_probe', 'initial_delay_seconds', + self.initial_delay_seconds) add_if_not_none('readiness_probe', 'post_data', self.post_data) - add_if_not_none('replica_policy', 'min_replica', self.min_replica) - add_if_not_none('replica_policy', 'max_replica', self.max_replica) + add_if_not_none('replica_policy', 'min_replicas', self.min_replicas) + add_if_not_none('replica_policy', 'max_replicas', self.max_replicas) add_if_not_none('replica_policy', 'qps_upper_threshold', self.qps_upper_threshold) add_if_not_none('replica_policy', 
'qps_lower_threshold', self.qps_lower_threshold) + add_if_not_none('controller_resources', None, + self._controller_resources) return config + def probe_str(self): + if self.post_data is None: + return f'GET {self.readiness_path}' + return f'POST {self.readiness_path} {json.dumps(self.post_data)}' + def policy_str(self): - if self.max_replica == self.min_replica or self.max_replica is None: - plural = '' - if self.min_replica > 1: - plural = 'S' - return f'#REPLICA{plural}: {self.min_replica}' + min_plural = '' if self.min_replicas == 1 else 's' + if self.max_replicas == self.min_replicas or self.max_replicas is None: + return f'Fixed {self.min_replicas} replica{min_plural}' # TODO(tian): Refactor to contain more information - return f'AUTOSCALE [{self.min_replica}, {self.max_replica}]' + max_plural = '' if self.max_replicas == 1 else 's' + return (f'Autoscaling from {self.min_replicas} to ' + f'{self.max_replicas} replica{max_plural}') + + def __repr__(self) -> str: + return textwrap.dedent(f"""\ + Readiness probe method: {self.probe_str()} + Replica autoscaling policy: {self.policy_str()} + Service initial delay seconds: {self.initial_delay_seconds} + + Please refer to SkyPilot Serve document for detailed explanations. 
+ """) + + @property + def readiness_suffix(self) -> str: + return f':{self._app_port}{self._readiness_path}' @property def readiness_path(self) -> str: return self._readiness_path @property - def readiness_timeout(self) -> int: - return self._readiness_timeout + def initial_delay_seconds(self) -> int: + return self._initial_delay_seconds @property def app_port(self) -> str: return self._app_port @property - def min_replica(self) -> int: - return self._min_replica + def min_replicas(self) -> int: + return self._min_replicas @property - def max_replica(self) -> Optional[int]: - return self._max_replica + def max_replicas(self) -> Optional[int]: + return self._max_replicas @property def qps_upper_threshold(self) -> Optional[float]: @@ -158,3 +234,7 @@ def qps_lower_threshold(self) -> Optional[float]: @property def post_data(self) -> Optional[Dict[str, Any]]: return self._post_data + + @property + def controller_resources(self) -> Optional[Dict[str, Any]]: + return self._controller_resources diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index c28fc671655..0d829ebc3bb 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -113,9 +113,6 @@ def parse_readme(readme: str) -> str: # Ray job has an issue with pydantic>2.0.0, due to API changes of pydantic. 
See # https://github.com/ray-project/ray/issues/36990 'pydantic<2.0', - # Required by the SkyServe library - 'uvicorn', - 'fastapi' ] # NOTE: Change the templates/spot-controller.yaml.j2 file if any of the @@ -147,6 +144,7 @@ def parse_readme(readme: str) -> str: 'cloudflare': aws_dependencies, 'scp': [], 'oci': ['oci'], + 'serve': ['uvicorn', 'fastapi'], } extras_require['all'] = sum(extras_require.values(), []) diff --git a/sky/status_lib.py b/sky/status_lib.py index 5ce63feda69..2d8d347ec35 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -54,19 +54,23 @@ class StorageStatus(enum.Enum): class ServiceStatus(enum.Enum): """Service status as recorded in table 'services'.""" - # Middleware is initializing + # Controller is initializing CONTROLLER_INIT = 'CONTROLLER_INIT' - # Replica is initializing + # Replica is initializing and no failure REPLICA_INIT = 'REPLICA_INIT' + # Controller failed to initialize / control plane or redirector jobs + # status abnormal + CONTRLLER_FAILED = 'CONTROLLER_FAILED' + # At least one replica is ready READY = 'READY' # Service is being shutting down SHUTTING_DOWN = 'SHUTTING_DOWN' - # At least one replica is failed + # At least one replica is failed and no replica is ready FAILED = 'FAILED' def colored_str(self): @@ -77,6 +81,7 @@ def colored_str(self): _SERVICE_STATUS_TO_COLOR = { ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE, ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, + ServiceStatus.CONTRLLER_FAILED: colorama.Fore.RED, ServiceStatus.READY: colorama.Fore.GREEN, ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, @@ -86,26 +91,48 @@ def colored_str(self): class ReplicaStatus(enum.Enum): """Replica status.""" - # Replica is initializing - INIT = 'INIT' + # The replica VM is being provisioned. i.e., the `sky.launch` is still + # running. + PROVISIONING = 'PROVISIONING' - # Replica is running + # The replica VM is provisioned and the service is starting. 
This indicates + # user's `setup` section or `run` section is still running, and the + # readiness probe fails. + STARTING = 'STARTING' + + # The replica VM is provisioned and the service is ready, i.e. the + # readiness probe is passed. READY = 'READY' - # Replica is unhealthy (e.g., health probe failed) - UNHEALTHY = 'UNHEALTHY' + # The service was ready before, but it becomes not ready now, i.e. the + # readiness probe fails. + NOT_READY = 'NOT_READY' - # Replica is failed + # The replica VM is being shut down. i.e., the `sky down` is still running. + SHUTTING_DOWN = 'SHUTTING_DOWN' + + # The replica VM is once failed and has been deleted. FAILED = 'FAILED' + # `sky.down` failed during service teardown. This could mean resource + # leakage. + FAILED_CLEANUP = 'FAILED_CLEANUP' + + # Unknown status. This should never happen. + UNKNOWN = 'UNKNOWN' + def colored_str(self): color = _REPLICA_STATUS_TO_COLOR[self] return f'{color}{self.value}{colorama.Style.RESET_ALL}' _REPLICA_STATUS_TO_COLOR = { - ReplicaStatus.INIT: colorama.Fore.BLUE, + ReplicaStatus.PROVISIONING: colorama.Fore.BLUE, + ReplicaStatus.STARTING: colorama.Fore.CYAN, ReplicaStatus.READY: colorama.Fore.GREEN, - ReplicaStatus.UNHEALTHY: colorama.Fore.YELLOW, + ReplicaStatus.NOT_READY: colorama.Fore.YELLOW, + ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, + ReplicaStatus.SHUTTING_DOWN: colorama.Fore.MAGENTA, ReplicaStatus.FAILED: colorama.Fore.RED, + ReplicaStatus.UNKNOWN: colorama.Fore.RED, } diff --git a/sky/templates/skyserve-controller.yaml.j2 b/sky/templates/skyserve-controller.yaml.j2 index a750c01e899..230c515b585 100644 --- a/sky/templates/skyserve-controller.yaml.j2 +++ b/sky/templates/skyserve-controller.yaml.j2 @@ -1,27 +1,12 @@ -resources: - cloud: gcp - disk_size: 100 - ports: -{%- for port in ports %} - - {{port}} -{%- endfor %} +# The template for skyserve controller -# {% if workdir is not none %} -# workdir: {{workdir}} -# {% endif %} +setup: | + # Install all serve dependencies. 
+ pip install skypilot[serve] > /dev/null 2>&1 + + # Shutdown jupyter service that is default enabled on our GCP Deep + # Learning Image. This is to avoid port conflict on 8080. + sudo systemctl stop jupyter > /dev/null 2>&1 || true file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} - -envs: - # skip cloud identity check for serve controller to avoid the overhead. - SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK: 1 -{% if is_dev %} - SKYPILOT_DEV: 1 -{% endif %} -{% if is_debug %} - SKYPILOT_DEBUG: 1 -{% endif %} -{% if disable_logging %} - SKYPILOT_DISABLE_USAGE_COLLECTION: 1 -{% endif %} diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 541425985e4..810b57a990d 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -13,6 +13,7 @@ from sky.utils import log_utils COMMAND_TRUNC_LENGTH = 25 +REPLICA_TRUNC_NUM = 10 NUM_COST_REPORT_LINES = 5 # A record in global_user_state's 'clusters' table. @@ -113,14 +114,13 @@ def show_status_table(cluster_records: List[_ClusterRecord], def show_service_table(service_records: List[_ServiceRecord], show_all: bool): status_columns = [ StatusColumn('NAME', _get_name), + StatusColumn('UPTIME', _get_uptime), + StatusColumn('STATUS', _get_service_status_colored), + StatusColumn('REPLICAS', _get_replicas), StatusColumn('CONTROLLER_CLUSTER_NAME', _get_controller_cluster_name, show_by_default=False), StatusColumn('ENDPOINT', _get_endpoint), - StatusColumn('#READY_REPLICAS', _get_ready_replicas), - StatusColumn('#UNHEALTHY_REPLICAS', _get_unhealthy_replicas), - StatusColumn('#FAILED_REPLICAS', _get_failed_replicas), - StatusColumn('STATUS', _get_service_status_colored), StatusColumn('POLICY', _get_policy, show_by_default=False), StatusColumn('REQUESTED_RESOURCES', _get_requested_resources, @@ -146,14 +146,22 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): def show_replica_table(replica_records: List[_ReplicaRecord], show_all: 
bool): status_columns = [ - StatusColumn('NAME', _get_name), + StatusColumn('SERVICE_NAME', _get_service_name), + StatusColumn('ID', _get_replica_id), StatusColumn('RESOURCES', - _get_resources, + _get_replica_resources, trunc_length=70 if not show_all else 0), - StatusColumn('REGION', _get_region), + StatusColumn('REGION', _get_replica_region), + StatusColumn('ZONE', _get_replica_zone, show_by_default=False), StatusColumn('STATUS', _get_status_colored), ] + truncate_hint = '' + if not show_all: + if len(replica_records) > REPLICA_TRUNC_NUM: + truncate_hint = '... (use --all to show all replicas)\n' + replica_records = replica_records[:REPLICA_TRUNC_NUM] + columns = [] for status_column in status_columns: if status_column.show_by_default or show_all: @@ -169,6 +177,7 @@ def show_replica_table(replica_records: List[_ReplicaRecord], show_all: bool): click.echo(replica_table) else: click.echo('No existing replicas.') + click.echo(truncate_hint, nl=False) def get_total_cost_of_displayed_records( @@ -371,18 +380,36 @@ def show_local_status_table(local_clusters: List[str]): _get_command = (lambda cluster_record: cluster_record['last_use']) _get_duration = (lambda cluster_record: log_utils.readable_time_duration( 0, cluster_record['duration'], absolute=True)) +_get_replica_id = lambda service_record: service_record['replica_id'] _get_controller_cluster_name = ( lambda service_record: service_record['controller_cluster_name']) -_get_endpoint = (lambda service_record: service_record['endpoint']) -_get_ready_replicas = ( - lambda service_record: service_record['num_ready_replicas']) -_get_unhealthy_replicas = ( - lambda service_record: service_record['num_unhealthy_replicas']) -_get_failed_replicas = ( - lambda service_record: service_record['num_failed_replicas']) _get_policy = (lambda service_record: service_record['policy']) _get_requested_resources = ( lambda service_record: service_record['requested_resources']) +_get_service_name = (lambda service_record: 
service_record['service_name']) + + +def _get_uptime(service_record: _ServiceRecord) -> str: + uptime = service_record['uptime'] + if uptime is None: + return '-' + return log_utils.readable_time_duration(uptime, absolute=True) + + +def _get_replicas(service_record: _ServiceRecord) -> str: + ready_replica_num = 0 + for info in service_record['replica_info']: + if _get_status(info) == status_lib.ReplicaStatus.READY: + ready_replica_num += 1 + total_replica_num = len(service_record['replica_info']) + return f'{ready_replica_num}/{total_replica_num}' + + +def _get_endpoint(service_record: _ServiceRecord) -> str: + endpoint = service_record['endpoint'] + if not endpoint: + return '-' + return endpoint def _get_service_status( @@ -433,6 +460,27 @@ def _get_zone(cluster_record: _ClusterRecord) -> str: return zone_str +def _get_replica_resources(cluster_record: _ClusterRecord) -> str: + handle = cluster_record['handle'] + if handle is None: + return '-' + return _get_resources(cluster_record) + + +def _get_replica_region(cluster_record: _ClusterRecord) -> str: + handle = cluster_record['handle'] + if handle is None: + return '-' + return _get_region(cluster_record) + + +def _get_replica_zone(cluster_record: _ClusterRecord) -> str: + handle = cluster_record['handle'] + if handle is None: + return '-' + return _get_zone(cluster_record) + + def _get_autostop(cluster_record: _ClusterRecord) -> str: autostop_str = '' separation = '' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 6668907a4f0..cf4f8ddb1dd 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -141,41 +141,45 @@ def get_service_schema(): return { '$schema': 'http://json-schema.org/draft-07/schema#', 'type': 'object', - 'required': ['port', 'readiness_probe', 'replica_policy'], + 'required': ['port', 'readiness_probe'], 'additionalProperties': False, 'properties': { 'port': { 'type': 'integer', }, 'readiness_probe': { - 'type': 'object', - 'required': ['path', 'readiness_timeout'], - 
'additionalProperties': False, - 'properties': { - 'path': { - 'type': 'string', - }, - 'readiness_timeout': { - 'type': 'number', - }, - 'post_data': { - 'anyOf': [{ + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'object', + 'required': ['path'], + 'additionalProperties': False, + 'properties': { + 'path': { 'type': 'string', - }, { - 'type': 'object', - }] + }, + 'initial_delay_seconds': { + 'type': 'number', + }, + 'post_data': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'object', + }] + } } - } + }] }, 'replica_policy': { 'type': 'object', - 'required': ['min_replica'], + 'required': ['min_replicas'], 'additionalProperties': False, 'properties': { - 'min_replica': { + 'min_replicas': { 'type': 'integer', }, - 'max_replica': { + 'max_replicas': { 'type': 'integer', }, 'qps_upper_threshold': { @@ -185,7 +189,14 @@ def get_service_schema(): 'type': 'number', }, } - } + }, + 'replicas': { + 'type': 'integer', + }, + # resources config is validated separately using RESOURCES_SCHEMA + 'controller_resources': { + 'type': 'object', + }, } } From d6bd068d7d021dba3dfca63a20941b602ce10a09 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 15 Aug 2023 11:35:07 -0700 Subject: [PATCH 010/223] [SkyServe] Final changes for v0 release (#2396) * add vicuna v1.5 example * add replica ip in table; rename some vars * warning if sky launch a service yaml * format * start progress after error log * fix type name * log format * logger with skylogging format * dump user app fail to control plane log * ux * add launched_at and service_yaml to local DB; delete cloud storage locally * rapid bootstraping * format * move skyserve controller to separate section in sky status * add hint to see detailed sky serve status * restore example * rename control plane to controller * rename to hello_skyserve * rename to hello_skyserve * change port to align doc * inline controller failed checking * override user resources parameter * format * add some todos * remove redundant return * use 
handle to store information * fix error const name * simplify resources representation * check cluster status earlier * minor * minor * add back service section since we still need it in controller * restore vicuna example * print all info when use sky serve status -a * better handling of unknown status * add warning for status that cannot be sky.down * minor comment fixes * remove Tip: to reuse an existing cluster * enable extra port on controller * more detailed info when acc is None * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * add doc string --------- Co-authored-by: Wei-Lin Chiang --- sky/__init__.py | 4 +- sky/backends/backend_utils.py | 71 +++++------ sky/backends/cloud_vm_ray_backend.py | 8 +- sky/cli.py | 84 ++++++++----- sky/core.py | 2 +- sky/data/storage.py | 34 +++-- sky/execution.py | 115 +++++++++-------- sky/global_user_state.py | 118 +++++++----------- sky/serve/__init__.py | 15 ++- sky/serve/autoscalers.py | 17 +-- sky/serve/constants.py | 7 +- sky/serve/{control_plane.py => controller.py} | 55 ++++---- .../hello_skyserve.yaml} | 4 +- .../index.html | 0 sky/serve/examples/llama2/chat.py | 3 +- sky/serve/examples/tgi_coder.yaml | 1 + sky/serve/examples/vicuna-v1.5.yaml | 40 ++++++ sky/serve/infra_providers.py | 54 ++++++-- sky/serve/load_balancers.py | 7 +- sky/serve/redirector.py | 49 ++++---- sky/serve/serve_utils.py | 108 +++++++++++++--- sky/serve/service_spec.py | 15 +-- sky/status_lib.py | 6 +- sky/utils/cli_utils/status_utils.py | 89 +++++++++---- 24 files changed, 548 insertions(+), 358 deletions(-) rename sky/serve/{control_plane.py => controller.py} (76%) rename sky/serve/examples/{http_minimal/http_minimal.yaml => hello_skyserve/hello_skyserve.yaml} (58%) rename sky/serve/examples/{http_minimal => hello_skyserve}/index.html (100%) create mode 100644 sky/serve/examples/vicuna-v1.5.yaml diff --git a/sky/__init__.py b/sky/__init__.py index 0bc5d4ded23..9e3a42dcbdc 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ 
-33,9 +33,9 @@ from sky.data import StoreType from sky.execution import exec # pylint: disable=redefined-builtin from sky.execution import launch -from sky.execution import spot_launch -from sky.execution import serve_up from sky.execution import serve_down +from sky.execution import serve_up +from sky.execution import spot_launch from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget from sky.resources import Resources diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ed6045e2727..9bf0f112e03 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -36,10 +36,10 @@ from sky import exceptions from sky import global_user_state from sky import provision as provision_lib +from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config from sky import spot as spot_lib -from sky import serve as serve_lib from sky import status_lib from sky.backends import onprem_utils from sky.skylet import constants @@ -2612,42 +2612,48 @@ def _service_status_from_replica_info( # If one replica is READY, the service is READY. if status2num[status_lib.ReplicaStatus.READY] > 0: return status_lib.ServiceStatus.READY - if (status2num[status_lib.ReplicaStatus.FAILED] + - status2num[status_lib.ReplicaStatus.FAILED_CLEANUP] > 0): + if sum(status2num[status] + for status in status_lib.ReplicaStatus.failed_statuses()) > 0: return status_lib.ServiceStatus.FAILED return status_lib.ServiceStatus.REPLICA_INIT -def _check_controller_status_and_set_service_status( - service_name: str, cluster_name: str) -> Optional[str]: - cluster_record = global_user_state.get_cluster_from_name(cluster_name) - if (cluster_record is None or - cluster_record['status'] != status_lib.ClusterStatus.UP): - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) - return f'Controller cluster {cluster_name!r} is not found or UP.' 
- return None - - def _refresh_service_record_no_lock( service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + """Refresh the service, and return the possibly updated record. + + Args: + service_name: The name of the service. + + Returns: + A tuple of a possibly updated record and an error message if any error + occurred when refreshing the service. + """ record = global_user_state.get_service_from_name(service_name) if record is None: return None, None + service_handle: serve_lib.ServiceHandle = record['handle'] try: check_network_connection() except exceptions.NetworkError: return record, 'Failed to refresh replica info due to network error.' - if not record['endpoint']: + if not service_handle.endpoint: # Service controller is still initializing. Skipped refresh status. return record, None - controller_cluster_name = record['controller_cluster_name'] - handle = global_user_state.get_handle_from_cluster_name( + controller_cluster_name = service_handle.controller_cluster_name + cluster_record = global_user_state.get_cluster_from_name( controller_cluster_name) - assert handle is not None + if (cluster_record is None or + cluster_record['status'] != status_lib.ClusterStatus.UP): + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + return record, (f'Controller cluster {controller_cluster_name!r} ' + 'is not found or UP.') + + handle = cluster_record['handle'] backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) @@ -2659,37 +2665,23 @@ def _refresh_service_record_no_lock( stream_logs=False, separate_stderr=True) if returncode != 0: - # If we cannot get the latest info, there are two possibilities: - # 1. The controller is not in a healthy state; - # 2. The control plane process somehow not respond to the request. - # For the first case, we want to catch the error and set the service - # status to CONTROLLER_FAILED. 
- # TODO(tian): Since we disabled sky down the controller, we might could - # assert cluster status is UP here and remove this function. - msg = _check_controller_status_and_set_service_status( - record['name'], controller_cluster_name) - if msg is None: - msg = ('Failed to refresh replica info from the controller. ' - f'Using the cached record. Reason: {stderr}') - return record, msg + return record, ('Failed to refresh replica info from the controller. ' + f'Using the cached record. Reason: {stderr}') latest_info = serve_lib.load_latest_info(latest_info_payload) - record['replica_info'] = latest_info['replica_info'] - record['uptime'] = latest_info['uptime'] + service_handle.replica_info = latest_info['replica_info'] + service_handle.uptime = latest_info['uptime'] - msg = None # When the service is shutting down, there is a period of time which the - # control plane still responds to the request, and the replica is not + # controller still responds to the request, and the replica is not # terminated, so the return value for _service_status_from_replica_info # will still be READY, but we don't want change service status to READY. 
if record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: - new_status = _service_status_from_replica_info( + record['status'] = _service_status_from_replica_info( latest_info['replica_info']) - record['status'] = new_status global_user_state.add_or_update_service(**record) - - return record, msg + return record, None def _refresh_service_record( @@ -2731,6 +2723,7 @@ def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: print( f'{colorama.Fore.YELLOW}Error occurred when refreshing service ' f'{service_name}: {msg}{colorama.Style.RESET_ALL}') + progress.start() progress.update(task, advance=1) return record diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 45454c4d85d..c37e3e169fb 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -31,10 +31,10 @@ from sky import optimizer from sky import provision as provision_lib from sky import resources as resources_lib +from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config from sky import spot as spot_lib -from sky import serve as serve_lib from sky import status_lib from sky import task as task_lib from sky.backends import backend_utils @@ -3093,8 +3093,8 @@ def _exec_code_on_head( f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') elif not name.startswith(serve_lib.CONTROLLER_PREFIX): - # Skip logging for submit control plane & redirector jobs - # to controller + # Skip logging for submit controller & redirector jobs + # to skyserve controller cluster logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -4044,7 +4044,7 @@ def _check_existing_cluster( f'{cluster_name!r} [Username: {ssh_user}].' 
f'{colorama.Style.RESET_ALL}\n' 'Run `sky status` to see existing clusters.') - else: + elif not cluster_name.startswith(serve_lib.CONTROLLER_PREFIX): logger.info( f'{colorama.Fore.CYAN}Creating a new cluster: "{cluster_name}" ' f'[{task.num_nodes}x {to_provision}].' diff --git a/sky/cli.py b/sky/cli.py index 506fd18a2d3..3c33fbbb40a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,9 +51,9 @@ from sky import core from sky import exceptions from sky import global_user_state +from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib -from sky import serve as serve_lib from sky import status_lib from sky.backends import backend_utils from sky.backends import onprem_utils @@ -1420,6 +1420,14 @@ def launch( with ux_utils.print_exception_no_traceback(): raise ValueError(f'{backend_name} backend is not supported.') + if task.service is not None: + logger.info( + f'{colorama.Fore.YELLOW}Service section will be ignored when using ' + f'`sky launch`. {colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}' + 'To spin up a service, use SkyServe CLI: ' + f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up' + f'{colorama.Style.RESET_ALL}') + _launch_with_confirm(task, backend, cluster, @@ -1729,12 +1737,22 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): refresh=refresh) nonreserved_cluster_records = [] reserved_clusters = [] + # TODO(tian): Rename this variable if other reserved prefix are added. 
+ skyserve_controllers = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: reserved_clusters.append(cluster_record) else: - nonreserved_cluster_records.append(cluster_record) + is_skyserve_controller = False + for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: + if cluster_name.startswith(prefix): + is_skyserve_controller = True + break + if is_skyserve_controller: + skyserve_controllers.append(cluster_record) + else: + nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) @@ -1744,6 +1762,14 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): status_utils.show_local_status_table(local_clusters) hints = [] + if skyserve_controllers: + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}\n' + f'SkyServe Controllers{colorama.Style.RESET_ALL}') + status_utils.show_status_table(skyserve_controllers, all) + hints.append( + f'* To see detailed service status: {colorama.Style.BRIGHT}' + f'sky serve status{colorama.Style.RESET_ALL}') + if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Managed spot jobs{colorama.Style.RESET_ALL}') @@ -4014,30 +4040,22 @@ def serve_up( raise ValueError( 'Specifying ports in resources is not allowed. 
SkyServe will ' 'use the port specified in the service section.') - return + controller_resources_config = copy.copy(serve_lib.CONTROLLER_RESOURCES) if task.service.controller_resources is not None: - controller_resources_config = task.service.controller_resources - else: - controller_resources_config = serve_lib.CONTROLLER_RESOURCES + controller_resources_config.update(task.service.controller_resources) try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config) except ValueError as e: raise ValueError( 'Encountered error when parsing controller resources') from e - if controller_resources.ports is not None: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Cannot specify ports in controller resources. SkyServe ' - 'will use the port specified in the service section.') - return click.secho('Service Spec:', fg='cyan') click.echo(task.service) dummy_controller_task = sky.Task().set_resources(controller_resources) - click.secho('The controller will use the following resources:', fg='cyan') + click.secho('The controller will use the following resource:', fg='cyan') with sky.Dag() as dag: dag.add(dummy_controller_task) sky.optimize(dag) @@ -4083,12 +4101,12 @@ def serve_status(all: bool, service_name: Optional[str]): - ``CONTROLLER_INIT``: The controller is initializing. - - ``REPLICA_INIT``: The controller provisioning have succeeded; control - plane and redirector is alive, and there are no available replicas for + - ``REPLICA_INIT``: The controller provisioning have succeeded; controller + and redirector process is alive, and there are no available replicas for now. This also indicates that no replica failure has been detected. - ``CONTROLLER_FAILED``: The controller failed to start or in an abnormal - state; or the control plane and redirector is not alive. + state; or the controller and redirector process is not alive. - ``READY``: The controller is ready to serve requests. 
This means that at least one replica have passed the readiness probe. @@ -4159,7 +4177,8 @@ def serve_status(all: bool, service_name: Optional[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: - for replica_record in service_record['replica_info']: + service_handle: serve_lib.ServiceHandle = service_record['handle'] + for replica_record in service_handle.replica_info: replica_record['service_name'] = service_record['name'] replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) @@ -4193,7 +4212,7 @@ def serve_down( yes: bool, purge: bool, ): - """Tear down service(s). + """Teardown service(s). SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence. @@ -4286,11 +4305,11 @@ def _down_service(name: str): default=True, help=('Follow the logs of the job. [default: --follow] ' 'If --no-follow is specified, print the log so far and exit.')) -@click.option('--control-plane', +@click.option('--controller', is_flag=True, default=False, required=False, - help='Show the control plane logs of this service.') + help='Show the controller logs of this service.') @click.option('--redirector', is_flag=True, default=False, @@ -4305,7 +4324,7 @@ def _down_service(name: str): def serve_logs( service_name: str, follow: bool, - control_plane: bool, + controller: bool, redirector: bool, replica_id: Optional[int], ): @@ -4315,8 +4334,8 @@ def serve_logs( .. 
code-block:: bash - # Tail the control plane logs of a service - sky serve logs --control-plane [SERVICE_ID] + # Tail the controller logs of a service + sky serve logs --controller [SERVICE_ID] \b # Print the redirector logs so far and exit sky serve logs --redirector --no-follow [SERVICE_ID] @@ -4325,22 +4344,19 @@ def serve_logs( sky serve logs [SERVICE_ID] 1 """ have_replica_id = replica_id is not None - if (control_plane + redirector + have_replica_id) != 1: - click.secho( - 'Only one of --control-plane, --redirector, --replica-id ' - 'can be specified. See `sky serve logs --help` for more ' - 'information.', - fg='red') - return + if (controller + redirector + have_replica_id) != 1: + raise click.UsageError( + 'One and only one of --controller, --redirector, ' + '[REPLICA_ID] can be specified.') service_record = global_user_state.get_service_from_name(service_name) if service_record is None: click.secho(f'Service {service_name!r} not found.', fg='red') return - controller_name = service_record['controller_cluster_name'] - if control_plane: - core.tail_logs(controller_name, job_id=1, follow=follow) + controller_cluster_name = service_record['handle'].controller_cluster_name + if controller: + core.tail_logs(controller_cluster_name, job_id=1, follow=follow) elif redirector: - core.tail_logs(controller_name, job_id=2, follow=follow) + core.tail_logs(controller_cluster_name, job_id=2, follow=follow) else: core.serve_tail_logs(service_record, replica_id, follow=follow) diff --git a/sky/core.py b/sky/core.py index c2ee7a9a2ed..f5cdba63031 100644 --- a/sky/core.py +++ b/sky/core.py @@ -127,7 +127,7 @@ def serve_tail_logs(service_record: Dict[str, Any], replica_id: int, with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r}\'s controller failed. 
' 'Cannot tail logs.') - controller_cluster_name = service_record['controller_cluster_name'] + controller_cluster_name = service_record['handle'].controller_cluster_name handle = global_user_state.get_handle_from_cluster_name( controller_cluster_name) if handle is None: diff --git a/sky/data/storage.py b/sky/data/storage.py index 34f14c48eff..b0f3bcc90af 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -277,7 +277,7 @@ def upload(self) -> None: """ raise NotImplementedError - def delete(self) -> None: + def delete(self, silent: bool = False) -> None: """Removes the Storage object from the cloud.""" raise NotImplementedError @@ -495,7 +495,7 @@ def __init__(self, (isinstance(self.source, list) or not data_utils.is_cloud_store_url(self.source))): msg = ' and uploading from source' - logger.info(f'Verifying bucket{msg} for storage {self.name}') + logger.debug(f'Verifying bucket{msg} for storage {self.name}') self.sync_all_stores() else: @@ -728,7 +728,7 @@ def add_store(self, store_type: Union[str, StoreType]) -> AbstractStore: store_type = StoreType(store_type) if store_type in self.stores: - logger.info(f'Storage type {store_type} already exists.') + logger.debug(f'Storage type {store_type} already exists.') return self.stores[store_type] store_cls: Type[AbstractStore] @@ -788,7 +788,9 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) - def delete(self, store_type: Optional[StoreType] = None) -> None: + def delete(self, + store_type: Optional[StoreType] = None, + silent: bool = False) -> None: """Deletes data for all sky-managed storage objects. If a storage is not managed by sky, it is not deleted from the cloud. 
@@ -808,7 +810,7 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: # remove handle and return if is_sky_managed: self.handle.remove_store(store) - store.delete() + store.delete(silent=silent) # Check remaining stores - if none is sky managed, remove # the storage from global_user_state. delete = all( @@ -818,16 +820,16 @@ def delete(self, store_type: Optional[StoreType] = None) -> None: else: global_user_state.set_storage_handle(self.name, self.handle) elif self.force_delete: - store.delete() + store.delete(silent=silent) # Remove store from bookkeeping del self.stores[store_type] else: for _, store in self.stores.items(): if store.is_sky_managed: self.handle.remove_store(store) - store.delete() + store.delete(silent=silent) elif self.force_delete: - store.delete() + store.delete(silent=silent) self.stores = {} # Remove storage from global_user_state if present global_user_state.remove_storage(self.name) @@ -1088,8 +1090,10 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self) -> None: + def delete(self, silent: bool = False) -> None: deleted_by_skypilot = self._delete_s3_bucket(self.name) + if silent: + return if deleted_by_skypilot: msg_str = f'Deleted S3 bucket {self.name}.' else: @@ -1486,8 +1490,10 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self) -> None: + def delete(self, silent: bool = False) -> None: deleted_by_skypilot = self._delete_gcs_bucket(self.name) + if silent: + return if deleted_by_skypilot: msg_str = f'Deleted GCS bucket {self.name}.' else: @@ -1858,8 +1864,10 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self) -> None: + def delete(self, silent: bool = False) -> None: deleted_by_skypilot = self._delete_r2_bucket(self.name) + if silent: + return if deleted_by_skypilot: msg_str = f'Deleted R2 bucket {self.name}.' 
else: @@ -2263,8 +2271,10 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self) -> None: + def delete(self, silent: bool = False) -> None: self._delete_cos_bucket() + if silent: + return logger.info(f'{colorama.Fore.GREEN}Deleted COS bucket {self.name}.' f'{colorama.Style.RESET_ALL}') diff --git a/sky/execution.py b/sky/execution.py index 111247bc120..eead8afb688 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -15,16 +15,15 @@ import copy import enum import getpass -import requests -from rich import console as rich_console -import tempfile -import time import os import tempfile +import time from typing import Any, Dict, List, Optional, Union import uuid import colorama +import requests +from rich import console as rich_console import sky from sky import backends @@ -33,10 +32,10 @@ from sky import exceptions from sky import global_user_state from sky import optimizer +from sky import serve from sky import sky_logging from sky import skypilot_config from sky import spot -from sky import serve from sky import status_lib from sky import task as task_lib from sky.backends import backend_utils @@ -965,39 +964,40 @@ def serve_up( service_name: str, controller_best_resources: 'sky.Resources', ): - """Serve up a service. + """Spin up a service. Please refer to the sky.cli.serve_up for the document. Args: task: sky.Task to serve up. - name: Name of the RESTful API. - - Raises: + service_name: Name of the service. + controller_best_resources: The optimized resources for the controller. 
""" - controller_cluster_name = serve.CONTROLLER_PREFIX + service_name + controller_cluster_name = serve.generate_controller_cluster_name( + service_name) assert task.service is not None, task - policy = task.service.policy_str() - assert len(task.resources) == 1 + assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] + service_handle = serve.ServiceHandle( + controller_cluster_name=controller_cluster_name, + policy=task.service.policy_str(), + requested_resources=requested_resources, + replica_info=[]) global_user_state.add_or_update_service( - service_name, None, controller_cluster_name, '', - status_lib.ServiceStatus.CONTROLLER_INIT, policy, requested_resources, - []) + service_name, None, service_handle, + status_lib.ServiceStatus.CONTROLLER_INIT) app_port = int(task.service.app_port) - assert len(task.resources) == 1, task - original_resources = list(task.resources)[0] - if original_resources.ports is not None and (len(original_resources.ports) - != 1): - if original_resources.ports[0] != app_port: - logger.warning('Ignoring port specification ' - f'{original_resources.ports} in resources.') - task.set_resources(original_resources.copy(ports=[app_port])) + task.set_resources(requested_resources.copy(ports=[app_port])) # TODO(tian): Use skyserve constants. - # The storage will be cleaned up by the control plane `terminate` method - # after the service is terminated. 
_maybe_translate_local_file_mounts_and_sync_up(task) + ephemeral_storage = [] + if task.storage_mounts is not None: + for storage in task.storage_mounts.values(): + if not storage.persistent: + ephemeral_storage.append(storage.to_yaml_config()) + service_handle.ephemeral_storage = ephemeral_storage + global_user_state.set_service_handle(service_name, service_handle) with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: @@ -1006,20 +1006,24 @@ def serve_up( 'resources']: del task_config['resources']['spot_recovery'] common_utils.dump_yaml(f.name, task_config) - remote_task_yaml_path = (serve.SERVICE_YAML_PREFIX + - f'/service_{service_name}.yaml') + remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( + service_name) vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': f.name, } - controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, - f'{service_name}.yaml') + controller_yaml_path = serve.generate_controller_yaml_file_name( + service_name) backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) + ports = [app_port] + # TODO(tian): We might need a thorough design on this. 
+ if controller_best_resources.ports is not None: + ports.extend(controller_best_resources.ports) controller_task.best_resources = (controller_best_resources.copy( - ports=[app_port])) + ports=ports)) controller_envs = { 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, @@ -1054,6 +1058,8 @@ def serve_up( handle = cluster_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) endpoint = f'{handle.head_ip}:{task.service.app_port}' + service_handle.endpoint = endpoint + global_user_state.set_service_handle(service_name, service_handle) console = rich_console.Console() @@ -1070,46 +1076,44 @@ def _wait_until_job_is_running(cluster_name: str, return False # NOTICE: The job submission order cannot be changed since the - # `sky serve logs` CLI will identify the control plane job with + # `sky serve logs` CLI will identify the controller job with # the first job submitted and the redirector job with the second # job submitted. - with console.status('[yellow]Launching control plane process on ' - 'controller...[/yellow]'): + with console.status('[yellow]Launching controller process...[/yellow]'): _execute( entrypoint=sky.Task( - name='run-control-plane', + name='run-controller', envs=controller_envs, - run='python -m sky.serve.control_plane --service-name ' + run='python -m sky.serve.controller --service-name ' f'{service_name} --task-yaml {remote_task_yaml_path} ' - f'--port {serve.CONTROL_PLANE_PORT}'), + f'--port {serve.CONTROLLER_PORT}'), stream_logs=False, handle=handle, stages=[Stage.EXEC], cluster_name=controller_cluster_name, detach_run=True, ) - control_plane_job_is_running = _wait_until_job_is_running( + controller_job_is_running = _wait_until_job_is_running( controller_cluster_name, 1) - if not control_plane_job_is_running: + if not controller_job_is_running: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) - print(f'{colorama.Fore.RED}Control plane failed to launch. 
' + print(f'{colorama.Fore.RED}Controller failed to launch. ' f'Please check the logs with sky serve logs {service_name} ' - f'--control-plane{colorama.Style.RESET_ALL}') + f'--controller{colorama.Style.RESET_ALL}') return - print(f'{colorama.Fore.GREEN}Control plane process is running.' + print(f'{colorama.Fore.GREEN}Launching controller process...done.' f'{colorama.Style.RESET_ALL}') - with console.status('[yellow]Launching redirector process on ' - 'controller...[/yellow]'): - control_plane_addr = f'http://localhost:{serve.CONTROL_PLANE_PORT}' + with console.status('[yellow]Launching redirector process...[/yellow]'): + controller_addr = f'http://localhost:{serve.CONTROLLER_PORT}' _execute( entrypoint=sky.Task( name='run-redirector', envs=controller_envs, run='python -m sky.serve.redirector --task-yaml ' f'{remote_task_yaml_path} --port {app_port} ' - f'--control-plane-addr {control_plane_addr}'), + f'--controller-addr {controller_addr}'), stream_logs=False, handle=handle, stages=[Stage.EXEC], @@ -1125,12 +1129,11 @@ def _wait_until_job_is_running(cluster_name: str, f'Please check the logs with sky serve logs {service_name} ' f'--redirector{colorama.Style.RESET_ALL}') return - print(f'{colorama.Fore.GREEN}Redirector process is running.' + print(f'{colorama.Fore.GREEN}Launching redirector process...done.' 
f'{colorama.Style.RESET_ALL}') global_user_state.set_service_status( service_name, status_lib.ServiceStatus.REPLICA_INIT) - global_user_state.set_service_endpoint(service_name, endpoint) print(f'\n{colorama.Fore.CYAN}Service name: ' f'{colorama.Style.BRIGHT}{service_name}{colorama.Style.RESET_ALL}' @@ -1138,7 +1141,7 @@ def _wait_until_job_is_running(cluster_name: str, f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' f'{backend_utils.RESET_BOLD}' '\nTo see logs of controller:' - f'\t{backend_utils.BOLD}sky serve logs --control-plane ' + f'\t{backend_utils.BOLD}sky serve logs --controller ' f'{service_name}{backend_utils.RESET_BOLD}' '\nTo see logs of redirector:' f'\t{backend_utils.BOLD}sky serve logs --redirector ' @@ -1171,13 +1174,12 @@ def serve_down( Args: service_name: Name of the service. - purge: If true, ignore errors when cleaning up the controller. """ service_record = global_user_state.get_service_from_name(service_name) # Already filered all inexist service in cli.py assert service_record is not None, service_name - controller_cluster_name = service_record['controller_cluster_name'] + controller_cluster_name = service_record['handle'].controller_cluster_name global_user_state.set_service_status(service_name, status_lib.ServiceStatus.SHUTTING_DOWN) handle = global_user_state.get_handle_from_cluster_name( @@ -1197,8 +1199,8 @@ def serve_down( try: subprocess_utils.handle_returncode( returncode, - code, - f'Failed to terminate service {service_name}', + code, ('Failed when submit terminate request to controller ' + f'of service {service_name}'), stderr, stream_logs=False) except exceptions.CommandError as e: @@ -1233,7 +1235,7 @@ def serve_down( core.cancel(controller_cluster_name, all=True, _from_serve_core=True) except (ValueError, sky.exceptions.ClusterNotUpError) as e: if purge: - logger.warning('Ignoring error when stopping control plane and ' + logger.warning('Ignoring error when stopping controller and ' f'redirector jobs of service 
{service_name}: {e}') else: raise RuntimeError() from e @@ -1248,8 +1250,11 @@ def serve_down( raise RuntimeError() from e # TODO(tian): Maybe add a post_cleanup function? - controller_yaml_path = os.path.join(serve.CONTROLLER_YAML_PREFIX, - f'{service_name}.yaml') + controller_yaml_path = serve.generate_controller_yaml_file_name( + service_name) if os.path.exists(controller_yaml_path): os.remove(controller_yaml_path) + handle = global_user_state.get_handle_from_service_name(service_name) + assert handle is not None + handle.cleanup_ephemeral_storage() global_user_state.remove_service(service_name) diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 19d73475238..98acdfd9eb3 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -25,8 +25,8 @@ if typing.TYPE_CHECKING: from sky import backends + from sky import serve from sky.data import Storage - from sky import resources as resources_lib _ENABLED_CLOUDS_KEY = 'enabled_clouds' @@ -97,13 +97,9 @@ def create_table(cursor, conn): cursor.execute("""\ CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, - uptime INTEGER, - controller_cluster_name TEXT, - endpoint TEXT, - status TEXT, - policy TEXT, - requested_resources BLOB, - replica_info BLOB)""") + launched_at INTEGER, + handle BLOB, + status TEXT)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. 
@@ -284,49 +280,33 @@ def add_or_update_cluster(cluster_name: str, _DB.conn.commit() -def add_or_update_service( - name: str, uptime: Optional[int], controller_cluster_name: str, - endpoint: str, status: status_lib.ServiceStatus, policy: str, - requested_resources: Optional['resources_lib.Resources'], - replica_info: List[Dict[str, Any]]) -> None: +def add_or_update_service(name: str, launched_at: Optional[int], + handle: 'serve.ServiceHandle', + status: status_lib.ServiceStatus) -> None: + if launched_at is None: + launched_at = int(time.time()) _DB.cursor.execute( 'INSERT or REPLACE INTO services' - '(name, uptime, controller_cluster_name, endpoint, ' - 'status, policy, requested_resources, replica_info) ' + '(name, launched_at, handle, status) ' 'VALUES (' # name '?, ' - # uptime - '?, ' - # controller_cluster_name + # launched_at '?, ' - # endpoint + # handle '?, ' # status - '?, ' - # policy - '?, ' - # requested_resources - '?, ' - # replica_info '?' ')', ( # name name, - # uptime - uptime, - # controller_cluster_name - controller_cluster_name, - # endpoint - endpoint, + # launched_at + launched_at, + # handle + pickle.dumps(handle), # status status.value, - # policy - policy, - # requested_resources - pickle.dumps(requested_resources), - pickle.dumps(replica_info), )) _DB.conn.commit() @@ -388,9 +368,9 @@ def set_service_status(service_name: str, status: status_lib.ServiceStatus): raise ValueError(f'Service {service_name} not found.') -def set_service_endpoint(service_name: str, endpoint: str): - _DB.cursor.execute('UPDATE services SET endpoint=(?) ' - 'WHERE name=(?)', (endpoint, service_name)) +def set_service_handle(service_name: str, handle: 'serve.ServiceHandle'): + _DB.cursor.execute('UPDATE services SET handle=(?) 
' + 'WHERE name=(?)', (pickle.dumps(handle), service_name)) count = _DB.cursor.rowcount _DB.conn.commit() assert count <= 1, count @@ -626,28 +606,35 @@ def get_cluster_from_name( return None +def _get_service_from_row(row) -> Dict[str, Any]: + # Explicitly specify the number of fields to unpack, so that + # we can add new fields to the database in the future without + # breaking the previous code. + name, launched_at, handle, status = row[:4] + # TODO: use namedtuple instead of dict + return { + 'name': name, + 'launched_at': launched_at, + 'handle': pickle.loads(handle), + 'status': status_lib.ServiceStatus[status], + } + + def get_service_from_name( service_name: Optional[str]) -> Optional[Dict[str, Any]]: rows = _DB.cursor.execute('SELECT * FROM services WHERE name=(?)', (service_name,)).fetchall() for row in rows: - # Explicitly specify the number of fields to unpack, so that - # we can add new fields to the database in the future without - # breaking the previous code. - (name, uptime, controller_cluster_name, endpoint, status, policy, - requested_resources, replica_info) = row[:8] - # TODO: use namedtuple instead of dict - record = { - 'name': name, - 'uptime': uptime, - 'controller_cluster_name': controller_cluster_name, - 'endpoint': endpoint, - 'status': status_lib.ServiceStatus[status], - 'policy': policy, - 'requested_resources': pickle.loads(requested_resources), - 'replica_info': pickle.loads(replica_info), - } - return record + return _get_service_from_row(row) + return None + + +def get_handle_from_service_name( + service_name: Optional[str]) -> Optional['serve.ServiceHandle']: + rows = _DB.cursor.execute('SELECT handle FROM services WHERE name=(?)', + (service_name,)).fetchall() + for (handle,) in rows: + return pickle.loads(handle) return None @@ -678,24 +665,11 @@ def get_clusters() -> List[Dict[str, Any]]: def get_services() -> List[Dict[str, Any]]: - rows = _DB.cursor.execute('select * from services').fetchall() + rows = _DB.cursor.execute( + 
'select * from services order by launched_at desc').fetchall() records = [] for row in rows: - (name, uptime, controller_cluster_name, endpoint, status, policy, - requested_resources, replica_info) = row[:8] - # TODO: use namedtuple instead of dict - - record = { - 'name': name, - 'uptime': uptime, - 'controller_cluster_name': controller_cluster_name, - 'endpoint': endpoint, - 'status': status_lib.ServiceStatus[status], - 'policy': policy, - 'requested_resources': pickle.loads(requested_resources), - 'replica_info': pickle.loads(replica_info), - } - + record = _get_service_from_row(row) records.append(record) return records diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index c8779ec3f1b..75f228ab47a 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,8 +1,13 @@ """Modules for SkyServe services.""" -from sky.serve.constants import (CONTROLLER_PREFIX, CONTROLLER_TEMPLATE, - CONTROLLER_YAML_PREFIX, SERVICE_YAML_PREFIX, - CONTROL_PLANE_PORT, CONTROLLER_RESOURCES) -from sky.serve.service_spec import SkyServiceSpec -from sky.serve.serve_utils import ServeCodeGen +from sky.serve.constants import CONTROLLER_PORT +from sky.serve.constants import CONTROLLER_PREFIX +from sky.serve.constants import CONTROLLER_RESOURCES +from sky.serve.constants import CONTROLLER_TEMPLATE +from sky.serve.serve_utils import generate_controller_cluster_name +from sky.serve.serve_utils import generate_controller_yaml_file_name +from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result +from sky.serve.serve_utils import ServeCodeGen +from sky.serve.serve_utils import ServiceHandle +from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 314e186d92c..415f6e0eb1e 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -4,11 +4,16 @@ import time from typing import 
Optional -from sky.serve import infra_providers from sky.serve import constants +from sky.serve import infra_providers logger = logging.getLogger(__name__) +# Since sky.launch is very resource demanding, we limit the number of +# concurrent sky.launch process to avoid overloading the machine. +# TODO(tian): determine this value based on controller resources. +_MAX_BOOTSTRAPING_NUM = 5 + class Autoscaler: """Abstract class for autoscalers.""" @@ -23,9 +28,9 @@ def __init__(self, # Default to fixed node, i.e. min_nodes == max_nodes. self.max_nodes: int = max_nodes or min_nodes self.frequency = frequency # Time to sleep in seconds. - if frequency < constants.CONTROL_PLANE_SYNC_INTERVAL: + if frequency < constants.CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' - 'control plane sync interval. It might ' + 'controller sync interval. It might ' 'not always got the latest information.') def evaluate_scaling(self) -> None: @@ -114,15 +119,13 @@ def evaluate_scaling(self) -> None: if num_nodes else num_requests_per_second) logger.info(f'Requests per node: {requests_per_node}') - # logger.info(f'Upper threshold: {self.upper_threshold} qps/node, ' - # f'lower threshold: {self.lower_threshold} qps/node, ' - # f'queries per node: {requests_per_node} qps/node') # Bootstrap case logger.info(f'Number of nodes: {num_nodes}') if num_nodes < self.min_nodes: logger.info('Bootstrapping service.') - self.scale_up(1) + self.scale_up(min(self.min_nodes - num_nodes, + _MAX_BOOTSTRAPING_NUM)) self.last_scale_operation = current_time elif (self.upper_threshold is not None and requests_per_node > self.upper_threshold): diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 3d4ba54cfd4..567cdec6be8 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -3,12 +3,11 @@ CONTROLLER_PREFIX = 'controller-' CONTROLLER_TEMPLATE = 'skyserve-controller.yaml.j2' -CONTROLLER_YAML_PREFIX = '~/.sky/serve' -SERVICE_YAML_PREFIX = '~/.sky/service' 
+SERVE_PREFIX = '~/.sky/serve' -CONTROL_PLANE_PORT = 31001 -CONTROL_PLANE_SYNC_INTERVAL = 20 +CONTROLLER_PORT = 31001 +CONTROLLER_SYNC_INTERVAL = 20 CONTROLLER_RESOURCES = {'disk_size': 100, 'cpus': '4+'} diff --git a/sky/serve/control_plane.py b/sky/serve/controller.py similarity index 76% rename from sky/serve/control_plane.py rename to sky/serve/controller.py index 3d69f5f872f..3bdf8d3592a 100644 --- a/sky/serve/control_plane.py +++ b/sky/serve/controller.py @@ -1,28 +1,26 @@ -"""Control Plane: the central control plane of SkyServe. +"""Controller: the central controller of SkyServe. Responsible for autoscaling and replica management. """ import argparse import base64 -import fastapi import logging import pickle from typing import Optional + +import fastapi import uvicorn -import sky -from sky import backends from sky import serve +from sky import sky_logging from sky.serve import autoscalers from sky.serve import infra_providers from sky.utils import env_options -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s | %(levelname)-6s | %(name)-40s || %(message)s', - datefmt='%m-%d %H:%M:%S', - force=True) -logger = logging.getLogger(__name__) +# Use the explicit logger name so that the logger is under the +# `sky.serve.controller` namespace when executed directly, so as +# to inherit the setup from the `sky` logger. +logger = sky_logging.init_logger('sky.serve.controller') class SuppressSuccessGetAccessLogsFilter(logging.Filter): @@ -32,8 +30,8 @@ def filter(self, record: logging.LogRecord) -> bool: return not ('GET' in message and '200' in message) -class ControlPlane: - """Control Plane: control everything about replica. +class Controller: + """Controller: control everything about replica. This class is responsible for: - Starting and terminating the replica monitor and autoscaler. 
@@ -42,18 +40,16 @@ class ControlPlane: def __init__(self, port: int, - task_yaml: str, infra_provider: infra_providers.InfraProvider, autoscaler: Optional[autoscalers.Autoscaler] = None) -> None: self.port = port - self.task_yaml = task_yaml self.infra_provider = infra_provider self.autoscaler = autoscaler self.app = fastapi.FastAPI() def run(self) -> None: - @self.app.post('/control_plane/update_num_requests') + @self.app.post('/controller/update_num_requests') async def update_num_requests(request: fastapi.Request): # await request request_data = await request.json() @@ -64,17 +60,17 @@ async def update_num_requests(request: fastapi.Request): self.autoscaler.set_num_requests(num_requests) return {'message': 'Success'} - @self.app.get('/control_plane/get_autoscaler_query_interval') + @self.app.get('/controller/get_autoscaler_query_interval') def get_autoscaler_query_interval(): if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): return {'query_interval': self.autoscaler.get_query_interval()} return {'query_interval': None} - @self.app.get('/control_plane/get_ready_replicas') + @self.app.get('/controller/get_ready_replicas') def get_ready_replicas(): return {'ready_replicas': self.infra_provider.get_ready_replicas()} - @self.app.get('/control_plane/get_latest_info') + @self.app.get('/controller/get_latest_info') def get_latest_info(): latest_info = { 'replica_info': @@ -87,7 +83,7 @@ def get_latest_info(): } return latest_info - @self.app.post('/control_plane/terminate') + @self.app.post('/controller/terminate') def terminate(request: fastapi.Request): del request logger.info('Terminating service...') @@ -95,12 +91,6 @@ def terminate(request: fastapi.Request): logger.info('Terminate autoscaler...') self.autoscaler.terminate() msg = self.infra_provider.terminate() - # Cleanup cloud storage - # TODO(tian): move to local serve_down so that we can cleanup - # local storage cache as well. 
- task = sky.Task.from_yaml(self.task_yaml) - backend = backends.CloudVmRayBackend() - backend.teardown_ephemeral_storage(task) return {'message': msg} # Run replica_prober and autoscaler (if autoscaler is defined) @@ -111,18 +101,18 @@ def terminate(request: fastapi.Request): self.autoscaler.start() # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflood - # the control plane logs. + # the controller logs. if not env_options.Options.SHOW_DEBUG_INFO.get(): logging.getLogger('uvicorn.access').addFilter( SuppressSuccessGetAccessLogsFilter()) logger.info( - f'SkyServe Control Plane started on http://localhost:{self.port}') + f'SkyServe Controller started on http://localhost:{self.port}') uvicorn.run(self.app, host='localhost', port=self.port) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='SkyServe Control Plane') + parser = argparse.ArgumentParser(description='SkyServe Controller') parser.add_argument('--service-name', type=str, help='Name of the service', @@ -134,7 +124,7 @@ def terminate(request: fastapi.Request): parser.add_argument('--port', '-p', type=int, - help='Port to run the control plane', + help='Port to run the controller', required=True) args = parser.parse_args() @@ -158,7 +148,6 @@ def terminate(request: fastapi.Request): cooldown=60, query_interval=60) - # ======= ControlPlane ========= - control_plane = ControlPlane(args.port, args.task_yaml, _infra_provider, - _autoscaler) - control_plane.run() + # ======= Controller ========= + controller = Controller(args.port, _infra_provider, _autoscaler) + controller.run() diff --git a/sky/serve/examples/http_minimal/http_minimal.yaml b/sky/serve/examples/hello_skyserve/hello_skyserve.yaml similarity index 58% rename from sky/serve/examples/http_minimal/http_minimal.yaml rename to sky/serve/examples/hello_skyserve/hello_skyserve.yaml index d3419b25c26..1e76fc24a12 100644 --- a/sky/serve/examples/http_minimal/http_minimal.yaml +++ 
b/sky/serve/examples/hello_skyserve/hello_skyserve.yaml @@ -1,5 +1,5 @@ service: - port: 9090 + port: 8080 readiness_probe: / resources: @@ -7,4 +7,4 @@ resources: workdir: . -run: python3 -m http.server 9090 +run: python3 -m http.server 8080 diff --git a/sky/serve/examples/http_minimal/index.html b/sky/serve/examples/hello_skyserve/index.html similarity index 100% rename from sky/serve/examples/http_minimal/index.html rename to sky/serve/examples/hello_skyserve/index.html diff --git a/sky/serve/examples/llama2/chat.py b/sky/serve/examples/llama2/chat.py index 2f450479851..2e353da7268 100644 --- a/sky/serve/examples/llama2/chat.py +++ b/sky/serve/examples/llama2/chat.py @@ -1,6 +1,7 @@ -import requests import json + import openai +import requests stream = True model = "Llama-2-7b-chat-hf" diff --git a/sky/serve/examples/tgi_coder.yaml b/sky/serve/examples/tgi_coder.yaml index b6247490bc5..a64bf41f226 100644 --- a/sky/serve/examples/tgi_coder.yaml +++ b/sky/serve/examples/tgi_coder.yaml @@ -6,5 +6,6 @@ service: readiness_probe: /health replicas: 2 +# TODO(tian): Maybe use some small model like 3b. run: | docker run --gpus all --shm-size 1g -p 8082:80 -v ~/data:/data ghcr.io/huggingface/text-generation-inference --model-id WizardLM/WizardCoder-15B-V1.0 diff --git a/sky/serve/examples/vicuna-v1.5.yaml b/sky/serve/examples/vicuna-v1.5.yaml new file mode 100644 index 00000000000..a46f75b49ca --- /dev/null +++ b/sky/serve/examples/vicuna-v1.5.yaml @@ -0,0 +1,40 @@ +resources: + accelerators: A100:1 + disk_size: 1024 + disk_tier: high + +service: + port: 8087 + readiness_probe: /v1/models + replicas: 2 + +envs: + MODEL_SIZE: 13 + +setup: | + conda activate chatbot + if [ $? -ne 0 ]; then + conda create -n chatbot python=3.9 -y + conda activate chatbot + fi + + # Install dependencies + pip install git+https://github.com/lm-sys/FastChat.git + pip install transformers torch accelerate sentencepiece + +run: | + conda activate chatbot + + echo 'Starting controller...' 
+ python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + sleep 10 + echo 'Starting model worker...' + python -u -m fastchat.serve.model_worker \ + --model-path lmsys/vicuna-${MODEL_SIZE}b-v1.5 2>&1 \ + | tee model_worker.log & + + echo 'Waiting for model worker to start...' + while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting openai api server server...' + python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 9c56054aece..b00a5d388e3 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -4,12 +4,13 @@ import logging import os import random -import requests import signal import subprocess import threading import time -from typing import List, Dict, Set, Optional, Any, Union, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +import requests from sky import backends from sky import core @@ -111,15 +112,18 @@ def to_replica_status(self) -> status_lib.ReplicaStatus: return status_lib.ReplicaStatus.UNKNOWN if self.sky_launch_status == ProcessStatus.FAILED: # sky.launch failed - # down process should have been started - return status_lib.ReplicaStatus.UNKNOWN + # Down process should have been started. + # If not started, this means some bug prevent sky.down from + # executing. It is also a potential resource leak, so we mark + # it as FAILED_CLEANUP. 
+ return status_lib.ReplicaStatus.FAILED_CLEANUP if self.service_ready_now: # Service is ready return status_lib.ReplicaStatus.READY if self.user_app_failed: # Failed on user setup/run - # down process should have been started - return status_lib.ReplicaStatus.UNKNOWN + # Same as above + return status_lib.ReplicaStatus.FAILED_CLEANUP if self.service_once_ready: # Service was ready before but not now return status_lib.ReplicaStatus.NOT_READY @@ -260,7 +264,7 @@ def _refresh_process_pool(self) -> None: 'Terminating...') info.status_property.sky_launch_status = ( ProcessStatus.FAILED) - self._teardown_cluster(cluster_name) + self._teardown_cluster(cluster_name, sync_down_logs=True) else: info.status_property.sky_launch_status = ( ProcessStatus.SUCCESS) @@ -322,9 +326,20 @@ def _fetch_job_status(self) -> None: job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: info.status_property.user_app_failed = True - logger.info(f'User APP for cluster {cluster_name} FAILED. ' - 'Terminating...') - self._teardown_cluster(cluster_name) + logger.warning(f'User APP for cluster {cluster_name} FAILED. ' + 'Start streaming logs...') + backend = backends.CloudVmRayBackend() + handle = info.handle + assert handle is not None, info + # Always tail the logs of the first job, which represent user + # setup & run. 
+ try: + backend.tail_logs(handle, job_id=1, follow=False) + except Exception as e: # pylint: disable=broad-except + logger.error(f'Error in streaming logs for cluster ' + f'{cluster_name}: {e}') + logger.info('Terminating...') + self._teardown_cluster(cluster_name, sync_down_logs=True) def _job_status_fetcher(self) -> None: while not self.job_status_fetcher_stop_event.is_set(): @@ -418,7 +433,15 @@ def _teardown_cluster(self, local_log_file_name = ( serve_utils.generate_replica_local_log_file_name(cluster_name)) with open(local_log_file_name, 'w') as f: - subprocess.run(code, shell=True, check=True, stdout=f) + try: + subprocess.run(code, shell=True, check=True, stdout=f) + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should teardown the + # cluster. + msg = ('Error in syncing down logs for cluster ' + f'{cluster_name}: {e}') + logger.error(msg) + print(msg, file=f) logger.info(f'Deleting SkyPilot cluster {cluster_name}') cmd = ['sky', 'down', cluster_name, '-y'] @@ -470,14 +493,21 @@ def terminate(self) -> Optional[str]: info = self.replica_info[name] # Set to success here for correctly display as shutting down info.status_property.sky_launch_status = ProcessStatus.SUCCESS + msg = [] for name, info in self.replica_info.items(): + if info.status in [ + status_lib.ReplicaStatus.FAILED_CLEANUP, + status_lib.ReplicaStatus.UNKNOWN, + ]: + msg.append(f'Cluster with status {info.status} found. 
Please ' + 'manually check the cloud console to make sure no ' + 'resource leak.') # Skip those already deleted and those are deleting if info.status not in [ status_lib.ReplicaStatus.FAILED, status_lib.ReplicaStatus.SHUTTING_DOWN ]: self._teardown_cluster(name, sync_down_logs=False) - msg = [] for name, p in self.down_process_pool.items(): p.wait() logger.info(f'Down process for cluster {name} finished.') diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancers.py index 3b831ca4a3e..60915fe3ae3 100644 --- a/sky/serve/load_balancers.py +++ b/sky/serve/load_balancers.py @@ -1,9 +1,10 @@ """LoadBalancer: select endpoint by load balancing algorithm.""" from collections import deque -import fastapi -import time import logging -from typing import Optional, Deque, Set +import time +from typing import Deque, Optional, Set + +import fastapi logger = logging.getLogger(__name__) diff --git a/sky/serve/redirector.py b/sky/serve/redirector.py index 7af4b6ce6b6..a94819bc7f0 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/redirector.py @@ -1,22 +1,20 @@ """Redirector: redirect any incoming request to an endpoint replica.""" import argparse -import fastapi -import logging import threading import time -import uvicorn + +import fastapi import requests +import uvicorn +from sky import sky_logging from sky.serve import constants from sky.serve import load_balancers -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s | %(levelname)-6s | %(name)-40s || %(message)s', - datefmt='%m-%d %H:%M:%S', - # force=True, -) -logger = logging.getLogger(__name__) +# Use the explicit logger name so that the logger is under the +# `sky.serve.redirector` namespace when executed directly, so as +# to inherit the setup from the `sky` logger. +logger = sky_logging.init_logger('sky.serve.redirector') class SkyServeRedirector: @@ -26,16 +24,16 @@ class SkyServeRedirector: to the appropriate endpoint replica. 
""" - def __init__(self, control_plane_url: str, port: int, + def __init__(self, controller_url: str, port: int, load_balancer: load_balancers.LoadBalancer): self.app = fastapi.FastAPI() - self.control_plane_url = control_plane_url + self.controller_url = controller_url self.port = port self.load_balancer = load_balancer for i in range(3): - resp = requests.get(self.control_plane_url + - '/control_plane/get_autoscaler_query_interval') + resp = requests.get(self.controller_url + + '/controller/get_autoscaler_query_interval') if resp.status_code == 200: self.load_balancer.set_query_interval( resp.json()['query_interval']) @@ -46,14 +44,13 @@ def __init__(self, control_plane_url: str, port: int, self.load_balancer.set_query_interval(None) time.sleep(10) - def _sync_with_control_plane(self): + def _sync_with_controller(self): while True: with requests.Session() as session: try: # send request num in last query interval response = session.post( - self.control_plane_url + - '/control_plane/update_num_requests', + self.controller_url + '/controller/update_num_requests', json={ 'num_requests': self.load_balancer.deprecate_old_requests() @@ -61,8 +58,8 @@ def _sync_with_control_plane(self): timeout=5) response.raise_for_status() # get replica ips - response = session.get(self.control_plane_url + - '/control_plane/get_ready_replicas') + response = session.get(self.controller_url + + '/controller/get_ready_replicas') response.raise_for_status() ready_replicas = response.json()['ready_replicas'] except requests.RequestException as e: @@ -70,7 +67,7 @@ def _sync_with_control_plane(self): else: logger.info(f'Available Replica IPs: {ready_replicas}') self.load_balancer.set_ready_replicas(ready_replicas) - time.sleep(constants.CONTROL_PLANE_SYNC_INTERVAL) + time.sleep(constants.CONTROLLER_SYNC_INTERVAL) async def _redirector_handler(self, request: fastapi.Request): self.load_balancer.increment_request_count(1) @@ -91,9 +88,9 @@ def run(self): self._redirector_handler, 
methods=['GET', 'POST', 'PUT', 'DELETE']) - sync_control_plane_thread = threading.Thread( - target=self._sync_with_control_plane, daemon=True) - sync_control_plane_thread.start() + sync_controller_thread = threading.Thread( + target=self._sync_with_controller, daemon=True) + sync_controller_thread.start() logger.info( f'SkyServe Redirector started on http://0.0.0.0:{self.port}') @@ -113,9 +110,9 @@ def run(self): type=int, help='Port to run the redirector on.', required=True) - parser.add_argument('--control-plane-addr', + parser.add_argument('--controller-addr', type=str, - help='Control plane address (ip:port).', + help='Controller address (ip:port).', required=True) args = parser.parse_args() @@ -123,7 +120,7 @@ def run(self): _load_balancer = load_balancers.RoundRobinLoadBalancer() # ======= Redirector ========= - redirector = SkyServeRedirector(control_plane_url=args.control_plane_addr, + redirector = SkyServeRedirector(controller_url=args.controller_addr, port=args.port, load_balancer=_load_balancer) redirector.run() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index eb8a4c71c05..88ff2d71860 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -1,21 +1,27 @@ """User interface with the SkyServe.""" import base64 -import colorama import os import pickle import re -import requests import shlex import time -from typing import Any, Dict, List, Optional, Iterator, TextIO, Callable +import typing +from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO + +import colorama +import requests from sky import backends from sky import global_user_state -from sky.serve import constants from sky import status_lib +from sky.data import storage as storage_lib +from sky.serve import constants from sky.utils import common_utils -_CONTROL_PLANE_URL = f'http://localhost:{constants.CONTROL_PLANE_PORT}' +if typing.TYPE_CHECKING: + import sky + +_CONTROLLER_URL = f'http://localhost:{constants.CONTROLLER_PORT}' 
_SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' _FAILED_TO_FIND_REPLICA_MSG = ( @@ -24,6 +30,23 @@ f' to check all valid replica id.{colorama.Style.RESET_ALL}') +def generate_controller_cluster_name(service_name: str) -> str: + return constants.CONTROLLER_PREFIX + service_name + + +def generate_remote_task_yaml_file_name(service_name: str) -> str: + service_name = service_name.replace('-', '_') + # Don't expand here since it is used for remote machine. + prefix = constants.SERVE_PREFIX + return os.path.join(prefix, f'{service_name}.yaml') + + +def generate_controller_yaml_file_name(service_name: str) -> str: + service_name = service_name.replace('-', '_') + prefix = os.path.expanduser(constants.SERVE_PREFIX) + return os.path.join(prefix, f'{service_name}_controller.yaml') + + def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: return f'{service_name}-{replica_id}' @@ -34,24 +57,80 @@ def get_replica_id_from_cluster_name(cluster_name: str) -> int: def generate_replica_launch_log_file_name(cluster_name: str) -> str: cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) - return f'{prefix}/{cluster_name}_launch.log' + prefix = os.path.expanduser(constants.SERVE_PREFIX) + return os.path.join(prefix, f'{cluster_name}_launch.log') def generate_replica_down_log_file_name(cluster_name: str) -> str: cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) - return f'{prefix}/{cluster_name}_down.log' + prefix = os.path.expanduser(constants.SERVE_PREFIX) + return os.path.join(prefix, f'{cluster_name}_down.log') def generate_replica_local_log_file_name(cluster_name: str) -> str: cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVICE_YAML_PREFIX) - return f'{prefix}/{cluster_name}_local.log' + prefix = 
os.path.expanduser(constants.SERVE_PREFIX) + return os.path.join(prefix, f'{cluster_name}_local.log') + + +class ServiceHandle(object): + """A pickle-able tuple of: + + - (required) Controller cluster name. + - (required) Service autoscaling policy descriotion str. + - (required) Service requested resources. + - (required) All replica info. + - (optional) Service uptime. + - (optional) Service endpoint URL. + - (optional) Epemeral storage generated for the service. + + This class is only used as a cache for information fetched from controller. + """ + _VERSION = 1 + + def __init__( + self, + *, + controller_cluster_name: str, + policy: str, + requested_resources: 'sky.Resources', + replica_info: List[Dict[str, Any]], + uptime: Optional[int] = None, + endpoint: Optional[str] = None, + ephemeral_storage: Optional[List[Dict[str, Any]]] = None) -> None: + self._version = self._VERSION + self.controller_cluster_name = controller_cluster_name + self.replica_info = replica_info + self.uptime = uptime + self.endpoint = endpoint + self.policy = policy + self.requested_resources = requested_resources + self.ephemeral_storage = ephemeral_storage + + def __repr__(self): + return ('ServiceHandle(' + f'\n\tcontroller_cluster_name={self.controller_cluster_name},' + f'\n\treplica_info={self.replica_info},' + f'\n\tuptime={self.uptime},' + f'\n\tendpoint={self.endpoint},' + f'\n\tpolicy={self.policy},' + f'\n\trequested_resources={self.requested_resources},' + f'\n\tephemeral_storage={self.ephemeral_storage})') + + def cleanup_ephemeral_storage(self) -> None: + if self.ephemeral_storage is None: + return + for storage_config in self.ephemeral_storage: + storage = storage_lib.Storage.from_yaml_config(storage_config) + storage.delete(silent=True) + + def __setsate__(self, state): + self._version = self._VERSION + self.__dict__.update(state) def get_latest_info() -> str: - resp = requests.get(_CONTROL_PLANE_URL + '/control_plane/get_latest_info') + resp = requests.get(_CONTROLLER_URL 
+ '/controller/get_latest_info') if resp.status_code != 200: raise ValueError(f'Failed to get replica info: {resp.text}') return common_utils.encode_payload(resp.json()) @@ -66,7 +145,7 @@ def load_latest_info(payload: str) -> Dict[str, Any]: def terminate_service() -> str: - resp = requests.post(_CONTROL_PLANE_URL + '/control_plane/terminate') + resp = requests.post(_CONTROLLER_URL + '/controller/terminate') resp = base64.b64encode(pickle.dumps(resp)).decode('utf-8') return common_utils.encode_payload(resp) @@ -169,8 +248,7 @@ def stream_logs(service_name: str, f'{colorama.Style.RESET_ALL}') def _get_replica_status() -> status_lib.ReplicaStatus: - resp = requests.get(_CONTROL_PLANE_URL + - '/control_plane/get_latest_info') + resp = requests.get(_CONTROLLER_URL + '/controller/get_latest_info') if resp.status_code != 200: raise ValueError( f'{colorama.Fore.RED}Failed to get replica info for service ' diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index f05edfceb6b..db06ab79d28 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -1,9 +1,10 @@ """Service specification for SkyServe.""" -import os import json -import yaml +import os import textwrap -from typing import Optional, Dict, Any +from typing import Any, Dict, Optional + +import yaml from sky.backends import backend_utils from sky.serve import constants @@ -35,12 +36,12 @@ def __init__( raise ValueError( 'max_replicas must be greater than or equal to min_replicas' ) - if app_port == constants.CONTROL_PLANE_PORT: + if app_port == constants.CONTROLLER_PORT: with ux_utils.print_exception_no_traceback(): raise ValueError( - f'App port cannot be {constants.CONTROL_PLANE_PORT} ' - 'since it is reserved for the control plane. ' - ' Please use a different port.') + f'App port cannot be {constants.CONTROLLER_PORT} ' + 'since it is reserved for the controller. 
' + 'Please use a different port.') if not readiness_path.startswith('/'): with ux_utils.print_exception_no_traceback(): raise ValueError('readiness_path must start with a slash (/). ' diff --git a/sky/status_lib.py b/sky/status_lib.py index 2d8d347ec35..78aa53447fa 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -60,7 +60,7 @@ class ServiceStatus(enum.Enum): # Replica is initializing and no failure REPLICA_INIT = 'REPLICA_INIT' - # Controller failed to initialize / control plane or redirector jobs + # Controller failed to initialize / controller or redirector process # status abnormal CONTRLLER_FAILED = 'CONTROLLER_FAILED' @@ -121,6 +121,10 @@ class ReplicaStatus(enum.Enum): # Unknown status. This should never happen. UNKNOWN = 'UNKNOWN' + @classmethod + def failed_statuses(cls): + return [cls.FAILED, cls.FAILED_CLEANUP, cls.UNKNOWN] + def colored_str(self): color = _REPLICA_STATUS_TO_COLOR[self] return f'{color}{self.value}{colorama.Style.RESET_ALL}' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index c11ab426a64..d86d61eb160 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,5 +1,6 @@ """Utilities for sky status.""" import re +import typing from typing import Any, Callable, Dict, List, Optional import click @@ -12,6 +13,10 @@ from sky.utils import common_utils from sky.utils import log_utils +if typing.TYPE_CHECKING: + import sky + from sky import serve + COMMAND_TRUNC_LENGTH = 25 REPLICA_TRUNC_NUM = 10 NUM_COST_REPORT_LINES = 5 @@ -115,6 +120,7 @@ def show_status_table(cluster_records: List[_ClusterRecord], def show_service_table(service_records: List[_ServiceRecord], show_all: bool): status_columns = [ StatusColumn('NAME', _get_name), + StatusColumn('LAUNCHED', _get_launched, show_by_default=False), StatusColumn('UPTIME', _get_uptime), StatusColumn('STATUS', _get_service_status_colored), StatusColumn('REPLICAS', _get_replicas), @@ -149,9 +155,11 @@ def 
show_replica_table(replica_records: List[_ReplicaRecord], show_all: bool): status_columns = [ StatusColumn('SERVICE_NAME', _get_service_name), StatusColumn('ID', _get_replica_id), - StatusColumn('RESOURCES', - _get_replica_resources, - trunc_length=70 if not show_all else 0), + StatusColumn('IP', _get_head_ip), + StatusColumn( + 'RESOURCES', + _get_full_replica_resources if show_all else _get_replica_resources, + trunc_length=70 if not show_all else 0), StatusColumn('REGION', _get_replica_region), StatusColumn('ZONE', _get_replica_zone, show_by_default=False), StatusColumn('STATUS', _get_status_colored), @@ -381,17 +389,33 @@ def show_local_status_table(local_clusters: List[str]): _get_command = (lambda cluster_record: cluster_record['last_use']) _get_duration = (lambda cluster_record: log_utils.readable_time_duration( 0, cluster_record['duration'], absolute=True)) -_get_replica_id = lambda service_record: service_record['replica_id'] -_get_controller_cluster_name = ( - lambda service_record: service_record['controller_cluster_name']) -_get_policy = (lambda service_record: service_record['policy']) -_get_requested_resources = ( - lambda service_record: service_record['requested_resources']) -_get_service_name = (lambda service_record: service_record['service_name']) +_get_replica_id = lambda replica_record: replica_record['replica_id'] +_get_service_name = (lambda replica_record: replica_record['service_name']) + + +def _get_service_handle( + service_record: _ServiceRecord) -> 'serve.ServiceHandle': + return service_record['handle'] + + +def _get_controller_cluster_name(service_record: _ServiceRecord) -> str: + handle = _get_service_handle(service_record) + return handle.controller_cluster_name + + +def _get_policy(service_record: _ServiceRecord) -> str: + handle = _get_service_handle(service_record) + return handle.policy + + +def _get_requested_resources(service_record: _ServiceRecord) -> 'sky.Resources': + handle = _get_service_handle(service_record) + return 
handle.requested_resources def _get_uptime(service_record: _ServiceRecord) -> str: - uptime = service_record['uptime'] + handle = _get_service_handle(service_record) + uptime = handle.uptime if uptime is None: return '-' return log_utils.readable_time_duration(uptime, absolute=True) @@ -399,15 +423,17 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: ready_replica_num = 0 - for info in service_record['replica_info']: + handle = _get_service_handle(service_record) + for info in handle.replica_info: if _get_status(info) == status_lib.ReplicaStatus.READY: ready_replica_num += 1 - total_replica_num = len(service_record['replica_info']) + total_replica_num = len(handle.replica_info) return f'{ready_replica_num}/{total_replica_num}' def _get_endpoint(service_record: _ServiceRecord) -> str: - endpoint = service_record['endpoint'] + handle = _get_service_handle(service_record) + endpoint = handle.endpoint if not endpoint: return '-' return endpoint @@ -461,25 +487,42 @@ def _get_zone(cluster_record: _ClusterRecord) -> str: return zone_str -def _get_replica_resources(cluster_record: _ClusterRecord) -> str: - handle = cluster_record['handle'] +def _get_full_replica_resources(replica_record: _ReplicaRecord) -> str: + handle = replica_record['handle'] if handle is None: return '-' - return _get_resources(cluster_record) + return _get_resources(replica_record) -def _get_replica_region(cluster_record: _ClusterRecord) -> str: - handle = cluster_record['handle'] +def _get_replica_resources(replica_record: _ReplicaRecord) -> str: + handle = replica_record['handle'] + if handle is None: + return '-' + assert isinstance(handle, backends.CloudVmRayResourceHandle) + cloud = handle.launched_resources.cloud + launched_resource_str = f'{cloud}' + if handle.launched_resources.accelerators is None: + vcpu, _ = cloud.get_vcpus_mem_from_instance_type( + handle.launched_resources.instance_type) + launched_resource_str += 
f'(vCPU={int(vcpu)})' + else: + launched_resource_str += f'({handle.launched_resources.accelerators})' + resources_str = (f'{handle.launched_nodes}x {launched_resource_str}') + return resources_str + + +def _get_replica_region(replica_record: _ReplicaRecord) -> str: + handle = replica_record['handle'] if handle is None: return '-' - return _get_region(cluster_record) + return _get_region(replica_record) -def _get_replica_zone(cluster_record: _ClusterRecord) -> str: - handle = cluster_record['handle'] +def _get_replica_zone(replica_record: _ReplicaRecord) -> str: + handle = replica_record['handle'] if handle is None: return '-' - return _get_zone(cluster_record) + return _get_zone(replica_record) def _get_autostop(cluster_record: _ClusterRecord) -> str: From 7968de9a2bfd0a1ae4e22ea1dae7959c8537a6a1 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Tue, 15 Aug 2023 14:04:34 -0700 Subject: [PATCH 011/223] fix serve example (#2406) fix --- sky/serve/examples/llama2/llama2.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sky/serve/examples/llama2/llama2.yaml b/sky/serve/examples/llama2/llama2.yaml index e36ecb7936e..4647b7b4eda 100644 --- a/sky/serve/examples/llama2/llama2.yaml +++ b/sky/serve/examples/llama2/llama2.yaml @@ -21,10 +21,7 @@ setup: | fi # Install dependencies - pip install git+https://github.com/lm-sys/FastChat.git - # Need the latest transformers to support 70B model - pip install git+https://github.com/huggingface/transformers.git - + pip install "fschat[model_worker,webui]==0.2.24" python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" run: | From 23e75298cdf9f9ee215e1513d2ac765293549e0a Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Tue, 15 Aug 2023 15:46:53 -0700 Subject: [PATCH 012/223] surface debug msg (#2407) * add msg * shorten * fix * add msg --- sky/cli.py | 2 ++ sky/execution.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 
3c33fbbb40a..41a1a11cd47 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4275,6 +4275,8 @@ def _down_service(name: str): sky.serve_down(name, purge) except RuntimeError as e: message = (f'{colorama.Fore.RED}Teardown service {name}...failed. ' + 'Please manually clean up the replicas and then use ' + '--purge to clean up the controller.' f'{colorama.Style.RESET_ALL}' f'\nReason: {common_utils.format_exception(e)}.') except (exceptions.NotSupportedError, diff --git a/sky/execution.py b/sky/execution.py index eead8afb688..3a87bf196dd 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1223,7 +1223,7 @@ def serve_down( logger.warning('Ignoring error when cleaning replicas of ' f'{service_name}: {e}') else: - raise RuntimeError() from e + raise RuntimeError(e) from e else: if not purge: with ux_utils.print_exception_no_traceback(): @@ -1238,7 +1238,7 @@ def serve_down( logger.warning('Ignoring error when stopping controller and ' f'redirector jobs of service {service_name}: {e}') else: - raise RuntimeError() from e + raise RuntimeError(e) from e try: core.down(controller_cluster_name, purge=purge) @@ -1247,7 +1247,7 @@ def serve_down( logger.warning('Ignoring error when terminating controller VM of ' f'service {service_name}: {e}') else: - raise RuntimeError() from e + raise RuntimeError(e) from e # TODO(tian): Maybe add a post_cleanup function? 
controller_yaml_path = serve.generate_controller_yaml_file_name( From 57b3347f20765b2bc897c36886f14cbf20dc9d0a Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 15 Aug 2023 17:46:54 -0700 Subject: [PATCH 013/223] [SkyServe] Fix port failover (#2408) fix --- sky/cli.py | 9 ++++++++- sky/execution.py | 10 ++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 41a1a11cd47..17beb3ee80e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4035,15 +4035,22 @@ def serve_up( click.secho('Service section not found in the YAML file.', fg='red') return assert len(task.resources) == 1 - if list(task.resources)[0].ports is not None: + requested_resources = list(task.resources)[0] + if requested_resources.ports is not None: with ux_utils.print_exception_no_traceback(): raise ValueError( 'Specifying ports in resources is not allowed. SkyServe will ' 'use the port specified in the service section.') + app_port = int(task.service.app_port) + task.set_resources(requested_resources.copy(ports=[app_port])) controller_resources_config = copy.copy(serve_lib.CONTROLLER_RESOURCES) if task.service.controller_resources is not None: controller_resources_config.update(task.service.controller_resources) + # TODO(tian): We might need a thorough design on this. + if 'ports' not in controller_resources_config: + controller_resources_config['ports'] = [] + controller_resources_config['ports'].append(app_port) try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config) diff --git a/sky/execution.py b/sky/execution.py index 3a87bf196dd..ea94b77f4d3 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -987,7 +987,6 @@ def serve_up( service_name, None, service_handle, status_lib.ServiceStatus.CONTROLLER_INIT) app_port = int(task.service.app_port) - task.set_resources(requested_resources.copy(ports=[app_port])) # TODO(tian): Use skyserve constants. 
_maybe_translate_local_file_mounts_and_sync_up(task) @@ -1018,12 +1017,7 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) - ports = [app_port] - # TODO(tian): We might need a thorough design on this. - if controller_best_resources.ports is not None: - ports.extend(controller_best_resources.ports) - controller_task.best_resources = (controller_best_resources.copy( - ports=ports)) + controller_task.best_resources = controller_best_resources controller_envs = { 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, @@ -1057,7 +1051,7 @@ def serve_up( handle = cluster_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) - endpoint = f'{handle.head_ip}:{task.service.app_port}' + endpoint = f'{handle.head_ip}:{app_port}' service_handle.endpoint = endpoint global_user_state.set_service_handle(service_name, service_handle) From 0a1a7de8e6139eb3d9f5808cafe2a2b11bfbfd64 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 18 Aug 2023 09:23:28 -0700 Subject: [PATCH 014/223] [SkyServe] Introducing smoke test and fix bugs (#2411) * add gcp tests * add azure and aws test * fix cloud dependencies * use larger disk size to enable azure controller * mixed cloud test & install gcloud cli * format * fix * add prehook * minor & add smoke test function --- sky/cli.py | 3 +- sky/execution.py | 8 +++- sky/serve/__init__.py | 1 + sky/serve/constants.py | 4 +- sky/serve/examples/http_server/server.py | 15 ++++--- sky/serve/serve_utils.py | 11 +++++ sky/task.py | 7 +++ sky/templates/skyserve-controller.yaml.j2 | 12 ++--- tests/skyserve/http_aws.yaml | 16 +++++++ tests/skyserve/http_azure.yaml | 19 ++++++++ tests/skyserve/http_gcp.yaml | 17 ++++++++ tests/skyserve/http_mixed_cloud.yaml | 16 +++++++ tests/test_smoke.py | 53 ++++++++++++++++++++++- 13 files changed, 167 insertions(+), 15 deletions(-) create mode 100644 tests/skyserve/http_aws.yaml create mode 100644 tests/skyserve/http_azure.yaml create 
mode 100644 tests/skyserve/http_gcp.yaml create mode 100644 tests/skyserve/http_mixed_cloud.yaml diff --git a/sky/cli.py b/sky/cli.py index 17beb3ee80e..51c10ee92b5 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4044,7 +4044,8 @@ def serve_up( app_port = int(task.service.app_port) task.set_resources(requested_resources.copy(ports=[app_port])) - controller_resources_config = copy.copy(serve_lib.CONTROLLER_RESOURCES) + controller_resources_config: Dict[str, Any] = copy.copy( + serve_lib.CONTROLLER_RESOURCES) if task.service.controller_resources is not None: controller_resources_config.update(task.service.controller_resources) # TODO(tian): We might need a thorough design on this. diff --git a/sky/execution.py b/sky/execution.py index ea94b77f4d3..585f6abb74f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -998,6 +998,8 @@ def serve_up( service_handle.ephemeral_storage = ephemeral_storage global_user_state.set_service_handle(service_name, service_handle) + task.add_skyserve_prehook() + with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() @@ -1010,6 +1012,8 @@ def serve_up( vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': f.name, + 'google_sdk_installation_commands': + gcp.GOOGLE_SDK_INSTALLATION_COMMAND, } controller_yaml_path = serve.generate_controller_yaml_file_name( service_name) @@ -1017,6 +1021,7 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) + controller_task.add_skyserve_prehook() controller_task.best_resources = controller_best_resources controller_envs = { @@ -1227,7 +1232,8 @@ def serve_down( try: core.cancel(controller_cluster_name, all=True, _from_serve_core=True) - except (ValueError, sky.exceptions.ClusterNotUpError) as e: + except (ValueError, exceptions.ClusterNotUpError, + exceptions.CommandError) as e: if purge: logger.warning('Ignoring error when 
stopping controller and ' f'redirector jobs of service {service_name}: {e}') diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 75f228ab47a..9c73463b02f 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -8,6 +8,7 @@ from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result +from sky.serve.serve_utils import SERVE_PREHOOK_COMMANDS from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 567cdec6be8..10468bc1e5a 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -9,7 +9,9 @@ CONTROLLER_PORT = 31001 CONTROLLER_SYNC_INTERVAL = 20 -CONTROLLER_RESOURCES = {'disk_size': 100, 'cpus': '4+'} +# We need 200GB disk space to enable using Azure as controller, since its image +# size is 150GB. +CONTROLLER_RESOURCES = {'disk_size': 200, 'cpus': '4+'} # A period of time to initialize your service. Any readiness probe failures # during this period will be ignored. 
diff --git a/sky/serve/examples/http_server/server.py b/sky/serve/examples/http_server/server.py index 303b117d26d..967e8bd73d8 100644 --- a/sky/serve/examples/http_server/server.py +++ b/sky/serve/examples/http_server/server.py @@ -1,8 +1,7 @@ +import argparse import http.server import socketserver -PORT = 8081 - class MyHttpRequestHandler(http.server.SimpleHTTPRequestHandler): @@ -26,8 +25,12 @@ def do_GET(self): return -Handler = MyHttpRequestHandler +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') + parser.add_argument('--port', type=int, required=False, default=8081) + args = parser.parse_args() -with socketserver.TCPServer(("", PORT), Handler) as httpd: - print("serving at port", PORT) - httpd.serve_forever() + Handler = MyHttpRequestHandler + with socketserver.TCPServer(("", args.port), Handler) as httpd: + print("serving at port", args.port) + httpd.serve_forever() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 88ff2d71860..7955beb59a0 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -21,6 +21,17 @@ if typing.TYPE_CHECKING: import sky +# A series of pre-hook commands that will be insert to the beginning of each +# serve-related task, Including controller and replcias. +# Shutdown jupyter service that is default enabled on our GCP Deep +# Learning Image. This is to avoid port conflict on 8080. +# Shutdown jupyterhub service that is default enabled on our Azure Deep +# Learning Image. This is to avoid port conflict on 8081. 
+SERVE_PREHOOK_COMMANDS = """\ +sudo systemctl stop jupyter > /dev/null 2>&1 || true +sudo systemctl stop jupyterhub > /dev/null 2>&1 || true +""" + _CONTROLLER_URL = f'http://localhost:{constants.CONTROLLER_PORT}' _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' diff --git a/sky/task.py b/sky/task.py index 99f029d97ea..5051bd6c601 100644 --- a/sky/task.py +++ b/sky/task.py @@ -800,6 +800,13 @@ def get_preferred_store_type(self) -> storage_lib.StoreType: store_type = storage_lib.get_storetype_from_cloud(storage_cloud) return store_type + def add_skyserve_prehook(self) -> None: + """(INTERNAL) Add prehook functions for skyserve task.""" + if self.setup is None: + self.setup = '' + assert isinstance(self.setup, str) + self.setup = serve_lib.SERVE_PREHOOK_COMMANDS + self.setup + def sync_storage_mounts(self) -> None: """(INTERNAL) Eagerly syncs storage mounts to cloud storage. diff --git a/sky/templates/skyserve-controller.yaml.j2 b/sky/templates/skyserve-controller.yaml.j2 index 230c515b585..f9965a187d1 100644 --- a/sky/templates/skyserve-controller.yaml.j2 +++ b/sky/templates/skyserve-controller.yaml.j2 @@ -1,12 +1,14 @@ # The template for skyserve controller setup: | - # Install all serve dependencies. - pip install skypilot[serve] > /dev/null 2>&1 + # Install all cloud dependencies. + # This is for multicloud support. To allow controller launch on all clouds, + # we need to install all cloud dependencies. + # This also includes all serve dependencies. + pip install skypilot[all] > /dev/null 2>&1 - # Shutdown jupyter service that is default enabled on our GCP Deep - # Learning Image. This is to avoid port conflict on 8080. - sudo systemctl stop jupyter > /dev/null 2>&1 || true + # Install gcloud CLI. 
+ {{google_sdk_installation_commands}} file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} diff --git a/tests/skyserve/http_aws.yaml b/tests/skyserve/http_aws.yaml new file mode 100644 index 00000000000..512bd5dc007 --- /dev/null +++ b/tests/skyserve/http_aws.yaml @@ -0,0 +1,16 @@ +resources: + cloud: aws + cpus: 2+ + +workdir: sky/serve/examples/http_server + +run: python3 server.py + +service: + port: 8081 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + controller_resources: + cloud: aws diff --git a/tests/skyserve/http_azure.yaml b/tests/skyserve/http_azure.yaml new file mode 100644 index 00000000000..b4a16442e59 --- /dev/null +++ b/tests/skyserve/http_azure.yaml @@ -0,0 +1,19 @@ +resources: + cloud: azure + cpus: 2+ + +workdir: sky/serve/examples/http_server + +# Default Azure image have a jupyterhub running on 8081, which will be +# terminated by skyserve prehook command. Here we still use 8081 to test +# our prehook commands normally execute +run: python3 server.py --port 8081 + +service: + port: 8081 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + controller_resources: + cloud: azure diff --git a/tests/skyserve/http_gcp.yaml b/tests/skyserve/http_gcp.yaml new file mode 100644 index 00000000000..94ffcbe434c --- /dev/null +++ b/tests/skyserve/http_gcp.yaml @@ -0,0 +1,17 @@ +resources: + cloud: gcp + cpus: 2+ + +workdir: sky/serve/examples/http_server + +# Use 8080 to test our prehook commands normally execute +run: python3 server.py --port 8080 + +service: + port: 8080 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + controller_resources: + cloud: gcp diff --git a/tests/skyserve/http_mixed_cloud.yaml b/tests/skyserve/http_mixed_cloud.yaml new file mode 100644 index 00000000000..81f65ad869a --- /dev/null +++ b/tests/skyserve/http_mixed_cloud.yaml @@ -0,0 +1,16 @@ +resources: + cloud: gcp + cpus: 2+ + +workdir: sky/serve/examples/http_server + +run: python3 
server.py + +service: + port: 8081 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + controller_resources: + cloud: aws diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 25eb5181abb..8794e8927b1 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -180,7 +180,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: test.teardown, stdout=log_file, stderr=subprocess.STDOUT, - timeout=10 * 60, # 10 mins + timeout=15 * 60, # 15 mins shell=True, ) @@ -2603,6 +2603,57 @@ def test_gcp_zero_quota_failover(): run_one_test(test) +# ---------- Testing skyserve ---------- + + +def _get_skyserve_test_task(name: str, suffix: str, timeout_minutes: int) -> Test: + url_regex = r'([0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}' + test = Test( + f'test-skyserve-{suffix.replace("_", "-")}', + [ + f'sky serve up -n {name} -y tests/skyserve/http_{suffix}.yaml', + f'(while true; do output=$(sky serve status {name}); echo "$output" | grep -q "2/2" && break; echo "$output" | grep -q "FAILED" && exit 1; sleep 10; done)', + f'url=$(sky serve status {name} | grep -Eo "{url_regex}"); curl -L http://$url | grep "Hi, SkyPilot here"', + ], + f'sky serve down -y {name}', + timeout=timeout_minutes * 60, + ) + return test + + +@pytest.mark.gcp +def test_skyserve_gcp(): + """Test skyserve on GCP""" + name = _get_cluster_name() + test = _get_skyserve_test_task(name, 'gcp', 20) + run_one_test(test) + + +@pytest.mark.aws +def test_skyserve_aws(): + """Test skyserve on AWS""" + name = _get_cluster_name() + test = _get_skyserve_test_task(name, 'aws', 20) + run_one_test(test) + + +@pytest.mark.azure +def test_skyserve_azure(): + """Test skyserve on Azure""" + name = _get_cluster_name() + test = _get_skyserve_test_task(name, 'azure', 30) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.aws +def test_skyserve_mixed_cloud(): + """Test skyserve on mixed cloud""" + name = _get_cluster_name() + test = _get_skyserve_test_task(name, 'mixed_cloud', 20) + 
run_one_test(test) + + # ------- Testing user ray cluster -------- @pytest.mark.no_kubernetes # Kubernetes does not support sky status -r yet. def test_user_ray_cluster(generic_cloud: str): From f7f33e9c5561ae2f9d6bbac6b7344567119fce93 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sat, 19 Aug 2023 18:27:02 -0700 Subject: [PATCH 015/223] [SkyServe] Add cancel and gorilla example (#2417) * add cancel and gorilla example * update yaml & add readme * add CLI request cancel * Update sky/serve/examples/gorilla/gorilla.yaml Co-authored-by: Wei-Lin Chiang * Update sky/serve/examples/misc/cancel/service.yaml Co-authored-by: Wei-Lin Chiang * advice from code review * upd fschat installation --------- Co-authored-by: Wei-Lin Chiang --- sky/serve/examples/gorilla/gorilla.yaml | 39 +++++++++++++++++++ sky/serve/examples/gorilla/run_gorilla.py | 36 +++++++++++++++++ sky/serve/examples/misc/cancel/README.md | 39 +++++++++++++++++++ .../misc/cancel/send_cancel_request.py | 31 +++++++++++++++ sky/serve/examples/misc/cancel/server.py | 33 ++++++++++++++++ sky/serve/examples/misc/cancel/service.yaml | 14 +++++++ 6 files changed, 192 insertions(+) create mode 100644 sky/serve/examples/gorilla/gorilla.yaml create mode 100644 sky/serve/examples/gorilla/run_gorilla.py create mode 100644 sky/serve/examples/misc/cancel/README.md create mode 100644 sky/serve/examples/misc/cancel/send_cancel_request.py create mode 100644 sky/serve/examples/misc/cancel/server.py create mode 100644 sky/serve/examples/misc/cancel/service.yaml diff --git a/sky/serve/examples/gorilla/gorilla.yaml b/sky/serve/examples/gorilla/gorilla.yaml new file mode 100644 index 00000000000..b08abb4b2c8 --- /dev/null +++ b/sky/serve/examples/gorilla/gorilla.yaml @@ -0,0 +1,39 @@ +resources: + accelerators: A100:1 + disk_size: 1024 + disk_tier: high + +service: + port: 8087 + readiness_probe: + path: /v1/models + initial_delay_seconds: 1800 + replicas: 2 + +setup: | + conda activate chatbot + if [ $? 
-ne 0 ]; then + conda create -n chatbot python=3.9 -y + conda activate chatbot + fi + + # Install dependencies + pip install fschat[model_worker,webui]==0.2.24 + pip install protobuf einops + +run: | + conda activate chatbot + + echo 'Starting controller...' + python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + sleep 10 + echo 'Starting model worker...' + python -u -m fastchat.serve.model_worker \ + --model-path gorilla-llm/gorilla-mpt-7b-hf-v0 2>&1 \ + | tee model_worker.log & + + echo 'Waiting for model worker to start...' + while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting openai api server server...' + python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/sky/serve/examples/gorilla/run_gorilla.py b/sky/serve/examples/gorilla/run_gorilla.py new file mode 100644 index 00000000000..eee74f37894 --- /dev/null +++ b/sky/serve/examples/gorilla/run_gorilla.py @@ -0,0 +1,36 @@ +# Code is borrowed from gorilla's colab +# https://colab.research.google.com/drive/1DEBPsccVLF_aUnmD0FwPeHFrtdC0QIUP?usp=sharing # pylint: disable=line-too-long + +import openai +import urllib.parse + +openai.api_key = "EMPTY" # Key is ignored and does not matter +# SkyServe endpoint +endpoint = input("Enter SkyServe endpoint: ") +# endpoint = '34.132.127.197:8000' +openai.api_base = f"http://{endpoint}/v1" + +# Report issues +def raise_issue(e, model, prompt): + issue_title = urllib.parse.quote("[bug] Hosted Gorilla: ") + issue_body = urllib.parse.quote(f"Exception: {e}\nFailed model: {model}, for prompt: {prompt}") + issue_url = f"https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}" + print(f"An exception has occurred: {e} \nPlease raise an issue here: {issue_url}") + +# Query Gorilla server +def get_gorilla_response(prompt, model="gorilla-mpt-7b-hf-v0"): + try: + 
completion = openai.ChatCompletion.create( + model=model, + messages=[{"role": "user", "content": prompt}] + ) + return completion.choices[0].message.content + except Exception as e: + raise_issue(e, model, prompt) + + +prompt = "I would like to translate 'I feel very good today.' from English to Chinese." +print(get_gorilla_response(prompt)) + +prompt = "I want to build a robot that can detecting objects in an image ‘cat.jpeg’. Input: [‘cat.jpeg’]" +print(get_gorilla_response(prompt)) diff --git a/sky/serve/examples/misc/cancel/README.md b/sky/serve/examples/misc/cancel/README.md new file mode 100644 index 00000000000..cc76b6b148c --- /dev/null +++ b/sky/serve/examples/misc/cancel/README.md @@ -0,0 +1,39 @@ +# SkyServe cancel example + +This example demonstrates that the redirector supports canceling a request. + +## Running the example + +From the SkyPilot root directory, run the following command: + +```bash +sky serve up sky/serve/examples/misc/cancel/service.yaml -n skyserve-cancel-test +``` + +Use `sky serve status` to monitor the status of the service. When it's ready, run + +```bash +sky serve logs skyserve-cancel-test 1 +``` + +to monitor the logs of the service. Run + +```bash +python3 sky/serve/examples/misc/cancel/send_cancel_request.py +``` + +and enter the endpoint output by `sky serve status`. You should see the following output: + +```bash +Computing... step 0 +Computing... step 1 +Client disconnected, stopping computation. +``` + +You can also run + +```bash +curl -L http://<endpoint>/ +``` + +and manually press Ctrl + C to cancel the request and see the logs.
diff --git a/sky/serve/examples/misc/cancel/send_cancel_request.py b/sky/serve/examples/misc/cancel/send_cancel_request.py new file mode 100644 index 00000000000..0548a5da451 --- /dev/null +++ b/sky/serve/examples/misc/cancel/send_cancel_request.py @@ -0,0 +1,31 @@ +import aiohttp +import asyncio + +redirector_endpoint = input('Enter redirector endpoint: ') + +async def fetch(session, url): + try: + async with session.get(url) as response: + print("Got response!") + return await response.text() + except asyncio.CancelledError: + print("Request was cancelled!") + raise + +async def main(): + timeout = 2 + + async with aiohttp.ClientSession() as session: + task = asyncio.create_task(fetch(session, f'http://{redirector_endpoint}/')) + + await asyncio.sleep(timeout) + # We manually cancel requests for test purposes. + # You could also manually Ctrl + C a curl to cancel a request. + task.cancel() + + try: + await task + except asyncio.CancelledError: + print("Main function caught the cancelled exception.") + +asyncio.run(main()) diff --git a/sky/serve/examples/misc/cancel/server.py b/sky/serve/examples/misc/cancel/server.py new file mode 100644 index 00000000000..242809517f7 --- /dev/null +++ b/sky/serve/examples/misc/cancel/server.py @@ -0,0 +1,33 @@ +from aiohttp import web +import argparse +import asyncio + +async def handle(request): + response = web.StreamResponse() + await response.prepare(request) + + try: + # Simulate a computation that takes 10 seconds + for i in range(10): + print("Computing... 
step", i) + await asyncio.sleep(1) + await response.write(b' ') # Sending a space as a heartbeat + await response.write(b'Completed after 10 seconds.') + except (asyncio.CancelledError, ConnectionResetError): + print("Client disconnected, stopping computation.") + return response + + return response + +async def health_check(request): + print("Received health check") + return web.Response(text="Healthy") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') + parser.add_argument('--port', type=int, required=False, default=8081) + args = parser.parse_args() + app = web.Application() + app.router.add_get('/health', health_check) + app.router.add_get('/', handle) + web.run_app(app, host='0.0.0.0', port=args.port) diff --git a/sky/serve/examples/misc/cancel/service.yaml b/sky/serve/examples/misc/cancel/service.yaml new file mode 100644 index 00000000000..0df1640f290 --- /dev/null +++ b/sky/serve/examples/misc/cancel/service.yaml @@ -0,0 +1,14 @@ +service: + port: 9000 + readiness_probe: + path: /health + initial_delay_seconds: 120 + controller_resources: + cpus: 2+ + +resources: + cpus: 2+ + +workdir: sky/serve/examples/misc/cancel + +run: python3 server.py --port 9000 From 8216e71ae027b0ac45b0c31502939ddbafc901af Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 23 Aug 2023 23:11:37 -0700 Subject: [PATCH 016/223] [SkyServe] Fix interrupt process group and format (#2449) * fix * format * update skyserve prompt * resolve comments * fix vllm --- sky/cli.py | 14 +++++++---- sky/serve/examples/gorilla/run_gorilla.py | 23 ++++++++++++------- .../misc/cancel/send_cancel_request.py | 9 ++++++-- sky/serve/examples/misc/cancel/server.py | 6 ++++- sky/serve/examples/vllm.yaml | 2 +- sky/serve/infra_providers.py | 19 +++++++++++++-- tests/test_smoke.py | 3 ++- 7 files changed, 57 insertions(+), 19 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 51c10ee92b5..d324cb8d00f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ 
-2108,10 +2108,16 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa arg_str = '--all' else: arg_str = ' '.join(map(str, jobs)) - error_str = ('Cancelling the spot controller\'s jobs is not allowed.' - f'\nTo cancel spot jobs, use: sky spot cancel [--all]' - f'\nDo you mean: {bold}sky spot cancel {arg_str}{reset}') + if cluster == spot_lib.SPOT_CONTROLLER_NAME: + error_str = ( + 'Cancelling the spot controller\'s jobs is not allowed.' + f'\nTo cancel spot jobs, use: sky spot cancel [--all]' + f'\nDo you mean: {bold}sky spot cancel {arg_str}{reset}') + else: + assert cluster.startswith(serve_lib.CONTROLLER_PREFIX) + error_str = ( + 'Cancelling the sky serve controller\'s jobs is not allowed.') click.echo(error_str) sys.exit(1) except ValueError as e: diff --git a/sky/serve/examples/gorilla/run_gorilla.py b/sky/serve/examples/gorilla/run_gorilla.py index eee74f37894..ce5199a5434 100644 --- a/sky/serve/examples/gorilla/run_gorilla.py +++ b/sky/serve/examples/gorilla/run_gorilla.py @@ -1,29 +1,36 @@ # Code is borrowed from gorilla's colab # https://colab.research.google.com/drive/1DEBPsccVLF_aUnmD0FwPeHFrtdC0QIUP?usp=sharing # pylint: disable=line-too-long -import openai import urllib.parse -openai.api_key = "EMPTY" # Key is ignored and does not matter +import openai + +openai.api_key = "EMPTY" # Key is ignored and does not matter # SkyServe endpoint endpoint = input("Enter SkyServe endpoint: ") # endpoint = '34.132.127.197:8000' openai.api_base = f"http://{endpoint}/v1" + # Report issues def raise_issue(e, model, prompt): issue_title = urllib.parse.quote("[bug] Hosted Gorilla: ") - issue_body = urllib.parse.quote(f"Exception: {e}\nFailed model: {model}, for prompt: {prompt}") + issue_body = urllib.parse.quote( + f"Exception: {e}\nFailed model: {model}, for prompt: {prompt}") issue_url = 
f"https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}" - print(f"An exception has occurred: {e} \nPlease raise an issue here: {issue_url}") + print( + f"An exception has occurred: {e} \nPlease raise an issue here: {issue_url}" + ) + # Query Gorilla server def get_gorilla_response(prompt, model="gorilla-mpt-7b-hf-v0"): try: - completion = openai.ChatCompletion.create( - model=model, - messages=[{"role": "user", "content": prompt}] - ) + completion = openai.ChatCompletion.create(model=model, + messages=[{ + "role": "user", + "content": prompt + }]) return completion.choices[0].message.content except Exception as e: raise_issue(e, model, prompt) diff --git a/sky/serve/examples/misc/cancel/send_cancel_request.py b/sky/serve/examples/misc/cancel/send_cancel_request.py index 0548a5da451..76f41f43baf 100644 --- a/sky/serve/examples/misc/cancel/send_cancel_request.py +++ b/sky/serve/examples/misc/cancel/send_cancel_request.py @@ -1,8 +1,10 @@ -import aiohttp import asyncio +import aiohttp + redirector_endpoint = input('Enter redirector endpoint: ') + async def fetch(session, url): try: async with session.get(url) as response: @@ -12,11 +14,13 @@ async def fetch(session, url): print("Request was cancelled!") raise + async def main(): timeout = 2 async with aiohttp.ClientSession() as session: - task = asyncio.create_task(fetch(session, f'http://{redirector_endpoint}/')) + task = asyncio.create_task( + fetch(session, f'http://{redirector_endpoint}/')) await asyncio.sleep(timeout) # We manually cancel requests for test purposes. 
@@ -28,4 +32,5 @@ async def main(): except asyncio.CancelledError: print("Main function caught the cancelled exception.") + asyncio.run(main()) diff --git a/sky/serve/examples/misc/cancel/server.py b/sky/serve/examples/misc/cancel/server.py index 242809517f7..dd3491f3a27 100644 --- a/sky/serve/examples/misc/cancel/server.py +++ b/sky/serve/examples/misc/cancel/server.py @@ -1,7 +1,9 @@ -from aiohttp import web import argparse import asyncio +from aiohttp import web + + async def handle(request): response = web.StreamResponse() await response.prepare(request) @@ -19,10 +21,12 @@ async def handle(request): return response + async def health_check(request): print("Received health check") return web.Response(text="Healthy") + if __name__ == '__main__': parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') parser.add_argument('--port', type=int, required=False, default=8081) diff --git a/sky/serve/examples/vllm.yaml b/sky/serve/examples/vllm.yaml index 7ae83534a69..7fa784f075f 100644 --- a/sky/serve/examples/vllm.yaml +++ b/sky/serve/examples/vllm.yaml @@ -17,7 +17,7 @@ setup: | # Setup the environment conda create -n chatbot python=3.10 -y conda activate chatbot - pip install pip install git+https://github.com/lm-sys/FastChat.git + pip install git+https://github.com/lm-sys/FastChat.git pip install vllm pip install accelerate fi diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index b00a5d388e3..84ec119aaf5 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -2,7 +2,6 @@ from concurrent import futures import enum import logging -import os import random import signal import subprocess @@ -10,6 +9,7 @@ import time from typing import Any, Dict, List, Optional, Set, Tuple, Union +import psutil import requests from sky import backends @@ -31,6 +31,19 @@ _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT // _ENDPOINT_PROBE_INTERVAL) +def _interrupt_process_and_children(pid: int) -> None: + parent_process = 
psutil.Process(pid) + for child_process in parent_process.children(recursive=True): + try: + child_process.send_signal(signal.SIGINT) + except psutil.NoSuchProcess: + pass + try: + parent_process.send_signal(signal.SIGINT) + except psutil.NoSuchProcess: + pass + + class ProcessStatus(enum.Enum): """Process status.""" @@ -485,7 +498,9 @@ def terminate(self) -> Optional[str]: # process_pool_refresher terminates if p.poll() is None: assert p.pid is not None - os.killpg(os.getpgid(p.pid), signal.SIGINT) + # Interrupt the launch process and its children. We use SIGINT + # here since sky.launch has great handling for it. + _interrupt_process_and_children(p.pid) p.wait() logger.info(f'Interrupted launch process for cluster {name} ' 'and deleted the cluster.') diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8794e8927b1..fe4b3d934fc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2606,7 +2606,8 @@ def test_gcp_zero_quota_failover(): # ---------- Testing skyserve ---------- -def _get_skyserve_test_task(name: str, suffix: str, timeout_minutes: int) -> Test: +def _get_skyserve_test_task(name: str, suffix: str, + timeout_minutes: int) -> Test: url_regex = r'([0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}' test = Test( f'test-skyserve-{suffix.replace("_", "-")}', From d8aaf65ff38cb3db5d9c87825eb841ff2284ae17 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 24 Aug 2023 13:08:30 -0700 Subject: [PATCH 017/223] add authentication init --- sky/serve/controller.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 3bdf8d3592a..963ce01dcc3 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -11,6 +11,7 @@ import fastapi import uvicorn +from sky import authentication from sky import serve from sky import sky_logging from sky.serve import autoscalers @@ -22,6 +23,9 @@ # to inherit the setup from the `sky` logger. 
logger = sky_logging.init_logger('sky.serve.controller') +# Generate ssh key pair to avoid race condition when multiple sky.launch +# are executed at the same time. +authentication.get_or_generate_keys() class SuppressSuccessGetAccessLogsFilter(logging.Filter): From ede0cad9d730f5bd8f9e2bfa7d7daee1e2799e35 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 24 Aug 2023 13:39:35 -0700 Subject: [PATCH 018/223] typo fix & remove prehook --- sky/backends/backend_utils.py | 2 +- sky/cli.py | 2 +- sky/core.py | 2 +- sky/execution.py | 11 ++++------- sky/serve/README.md | 2 +- sky/serve/__init__.py | 1 - sky/serve/autoscalers.py | 8 ++++---- sky/serve/controller.py | 3 ++- sky/serve/examples/llama2/chat.py | 2 +- sky/serve/serve_utils.py | 17 +++-------------- sky/status_lib.py | 4 ++-- sky/task.py | 7 ------- tests/skyserve/http_azure.yaml | 4 +--- tests/skyserve/http_gcp.yaml | 2 +- 14 files changed, 22 insertions(+), 45 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ac153fc4d8b..f7978e9459f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2634,7 +2634,7 @@ def _refresh_service_record_no_lock( if (cluster_record is None or cluster_record['status'] != status_lib.ClusterStatus.UP): global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) return record, (f'Controller cluster {controller_cluster_name!r} ' 'is not found or UP.') diff --git a/sky/cli.py b/sky/cli.py index e534666dc3b..65406b4d2ec 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3994,7 +3994,7 @@ def serve_up( service_name) if previous_service_record is not None: if previous_service_record['status'] in [ - status_lib.ServiceStatus.CONTRLLER_FAILED, + status_lib.ServiceStatus.CONTROLLER_FAILED, status_lib.ServiceStatus.FAILED ]: prompt = (f'Service {service_name!r} has failed. 
' diff --git a/sky/core.py b/sky/core.py index 3f8e4b4186d..ebfd60f3383 100644 --- a/sky/core.py +++ b/sky/core.py @@ -123,7 +123,7 @@ def serve_tail_logs(service_record: Dict[str, Any], replica_id: int, raise ValueError( f'Service {service_name!r} is still initializing its ' 'controller. Please try again later.') - if service_record['status'] == status_lib.ServiceStatus.CONTRLLER_FAILED: + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r}\'s controller failed. ' 'Cannot tail logs.') diff --git a/sky/execution.py b/sky/execution.py index 585f6abb74f..07d47736870 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -998,8 +998,6 @@ def serve_up( service_handle.ephemeral_storage = ephemeral_storage global_user_state.set_service_handle(service_name, service_handle) - task.add_skyserve_prehook() - with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() @@ -1021,7 +1019,6 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) - controller_task.add_skyserve_prehook() controller_task.best_resources = controller_best_resources controller_envs = { @@ -1049,7 +1046,7 @@ def serve_up( if (cluster_record is None or cluster_record['status'] != status_lib.ClusterStatus.UP): global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) print(f'{colorama.Fore.RED}Controller failed to launch. 
' f'Please check the logs above.{colorama.Style.RESET_ALL}') return @@ -1096,7 +1093,7 @@ def _wait_until_job_is_running(cluster_name: str, controller_cluster_name, 1) if not controller_job_is_running: global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) print(f'{colorama.Fore.RED}Controller failed to launch. ' f'Please check the logs with sky serve logs {service_name} ' f'--controller{colorama.Style.RESET_ALL}') @@ -1123,7 +1120,7 @@ def _wait_until_job_is_running(cluster_name: str, controller_cluster_name, 2) if not redirector_job_is_running: global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTRLLER_FAILED) + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) print(f'{colorama.Fore.RED}Redirector failed to launch. ' f'Please check the logs with sky serve logs {service_name} ' f'--redirector{colorama.Style.RESET_ALL}') @@ -1176,7 +1173,7 @@ def serve_down( purge: If true, ignore errors when cleaning up the controller. """ service_record = global_user_state.get_service_from_name(service_name) - # Already filered all inexist service in cli.py + # Already filtered all inexistent service in cli.py assert service_record is not None, service_name controller_cluster_name = service_record['handle'].controller_cluster_name global_user_state.set_service_status(service_name, diff --git a/sky/serve/README.md b/sky/serve/README.md index 41578e854ff..4d86bbf4bbd 100644 --- a/sky/serve/README.md +++ b/sky/serve/README.md @@ -9,7 +9,7 @@ Sky Serve transparently handles load balancing, failover and autoscaling of the ## Architecture Sky Serve has four key components: -1. Redirector - The HTTP server is responsible for recieving requests and redirecting them to healthy endpoints. +1. Redirector - The HTTP server is responsible for receiving requests and redirecting them to healthy endpoints. 2. 
Load balancers - spread requests across healthy endpoints according to different policies. 3. Autoscalers - scale up and down the number of serving endpoints according to different policies and handle recovery of unhealthy endpoints. 4. Infra Providers - provides a uniform interface to talk to SkyPilot. diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 9c73463b02f..75f228ab47a 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -8,7 +8,6 @@ from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result -from sky.serve.serve_utils import SERVE_PREHOOK_COMMANDS from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 33d44e113ec..6bd4ab74bc3 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -12,7 +12,7 @@ # Since sky.launch is very resource demanding, we limit the number of # concurrent sky.launch process to avoid overloading the machine. # TODO(tian): determine this value based on controller resources. -_MAX_BOOTSTRAPING_NUM = 5 +_MAX_BOOTSTRAPPING_NUM = 5 class Autoscaler: @@ -78,7 +78,7 @@ def __init__(self, *args, upper_threshold: Optional[float], super().__init__(*args, **kwargs) # Cooldown between two scaling operations in seconds. self.cooldown: int = cooldown - # Quesy interval for requests num. Every `query_interval` seconds, + # Query interval for requests num. Every `query_interval` seconds, # Autoscaler will received an update for number of requests from # redirector. 
self.query_interval: int = query_interval @@ -125,8 +125,8 @@ def evaluate_scaling(self) -> None: logger.info(f'Number of nodes: {num_nodes}') if num_nodes < self.min_nodes: logger.info('Bootstrapping service.') - self.scale_up(min(self.min_nodes - num_nodes, - _MAX_BOOTSTRAPING_NUM)) + self.scale_up( + min(self.min_nodes - num_nodes, _MAX_BOOTSTRAPPING_NUM)) self.last_scale_operation = current_time elif (self.upper_threshold is not None and requests_per_node > self.upper_threshold): diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 963ce01dcc3..9493808908b 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -27,6 +27,7 @@ # are executed at the same time. authentication.get_or_generate_keys() + class SuppressSuccessGetAccessLogsFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: @@ -104,7 +105,7 @@ def terminate(request: fastapi.Request): if self.autoscaler is not None: self.autoscaler.start() - # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflood + # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflowing # the controller logs. if not env_options.Options.SHOW_DEBUG_INFO.get(): logging.getLogger('uvicorn.access').addFilter( diff --git a/sky/serve/examples/llama2/chat.py b/sky/serve/examples/llama2/chat.py index 2e353da7268..ba2f59658f5 100644 --- a/sky/serve/examples/llama2/chat.py +++ b/sky/serve/examples/llama2/chat.py @@ -5,7 +5,7 @@ stream = True model = "Llama-2-7b-chat-hf" -init_prompt = "You are a helful assistant." +init_prompt = "You are a helpful assistant." 
history = [{"role": "system", "content": init_prompt}] endpoint = input("Endpoint: ") url = f"http://{endpoint}/v1/chat/completions" diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 7955beb59a0..4907b52e734 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -21,17 +21,6 @@ if typing.TYPE_CHECKING: import sky -# A series of pre-hook commands that will be insert to the beginning of each -# serve-related task, Including controller and replcias. -# Shutdown jupyter service that is default enabled on our GCP Deep -# Learning Image. This is to avoid port conflict on 8080. -# Shutdown jupyterhub service that is default enabled on our Azure Deep -# Learning Image. This is to avoid port conflict on 8081. -SERVE_PREHOOK_COMMANDS = """\ -sudo systemctl stop jupyter > /dev/null 2>&1 || true -sudo systemctl stop jupyterhub > /dev/null 2>&1 || true -""" - _CONTROLLER_URL = f'http://localhost:{constants.CONTROLLER_PORT}' _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' @@ -88,12 +77,12 @@ class ServiceHandle(object): """A pickle-able tuple of: - (required) Controller cluster name. - - (required) Service autoscaling policy descriotion str. + - (required) Service autoscaling policy description str. - (required) Service requested resources. - (required) All replica info. - (optional) Service uptime. - (optional) Service endpoint URL. - - (optional) Epemeral storage generated for the service. + - (optional) Ephemeral storage generated for the service. This class is only used as a cache for information fetched from controller. 
""" @@ -135,7 +124,7 @@ def cleanup_ephemeral_storage(self) -> None: storage = storage_lib.Storage.from_yaml_config(storage_config) storage.delete(silent=True) - def __setsate__(self, state): + def __setstate__(self, state): self._version = self._VERSION self.__dict__.update(state) diff --git a/sky/status_lib.py b/sky/status_lib.py index 78aa53447fa..dc0d2f080eb 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -62,7 +62,7 @@ class ServiceStatus(enum.Enum): # Controller failed to initialize / controller or redirector process # status abnormal - CONTRLLER_FAILED = 'CONTROLLER_FAILED' + CONTROLLER_FAILED = 'CONTROLLER_FAILED' # At least one replica is ready READY = 'READY' @@ -81,7 +81,7 @@ def colored_str(self): _SERVICE_STATUS_TO_COLOR = { ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE, ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, - ServiceStatus.CONTRLLER_FAILED: colorama.Fore.RED, + ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, ServiceStatus.READY: colorama.Fore.GREEN, ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, diff --git a/sky/task.py b/sky/task.py index 2c28423ef45..c919fca7ad9 100644 --- a/sky/task.py +++ b/sky/task.py @@ -837,13 +837,6 @@ def get_preferred_store_type(self) -> storage_lib.StoreType: store_type = storage_lib.get_storetype_from_cloud(storage_cloud) return store_type - def add_skyserve_prehook(self) -> None: - """(INTERNAL) Add prehook functions for skyserve task.""" - if self.setup is None: - self.setup = '' - assert isinstance(self.setup, str) - self.setup = serve_lib.SERVE_PREHOOK_COMMANDS + self.setup - def sync_storage_mounts(self) -> None: """(INTERNAL) Eagerly syncs storage mounts to cloud storage. 
diff --git a/tests/skyserve/http_azure.yaml b/tests/skyserve/http_azure.yaml index b4a16442e59..7bf5ac927ba 100644 --- a/tests/skyserve/http_azure.yaml +++ b/tests/skyserve/http_azure.yaml @@ -4,9 +4,7 @@ resources: workdir: sky/serve/examples/http_server -# Default Azure image have a jupyterhub running on 8081, which will be -# terminated by skyserve prehook command. Here we still use 8081 to test -# our prehook commands normally execute +# Use 8080 to test jupyterhub service is terminated run: python3 server.py --port 8081 service: diff --git a/tests/skyserve/http_gcp.yaml b/tests/skyserve/http_gcp.yaml index 94ffcbe434c..08ce7899631 100644 --- a/tests/skyserve/http_gcp.yaml +++ b/tests/skyserve/http_gcp.yaml @@ -4,7 +4,7 @@ resources: workdir: sky/serve/examples/http_server -# Use 8080 to test our prehook commands normally execute +# Use 8080 to test jupyter service is terminated run: python3 server.py --port 8080 service: From 4ace06107c445d0d83314d516ee19366b106913f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 24 Aug 2023 14:33:01 -0700 Subject: [PATCH 019/223] fix --no-follow in replica log and disable cancel log when skyserve down --- sky/backends/cloud_vm_ray_backend.py | 7 ++++++- sky/core.py | 2 +- sky/serve/serve_utils.py | 12 ++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 20d75f1c657..acaa003563d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3438,7 +3438,8 @@ def get_job_status( def cancel_jobs(self, handle: CloudVmRayResourceHandle, jobs: Optional[List[int]], - cancel_all: bool = False) -> None: + cancel_all: bool = False, + silent: bool = False) -> None: """Cancels jobs. CloudVMRayBackend specific method. 
@@ -3470,6 +3471,10 @@ def cancel_jobs(self, f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout) cancelled_ids = common_utils.decode_payload(stdout) + + if silent: + return + if cancelled_ids: logger.info( f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}') diff --git a/sky/core.py b/sky/core.py index ebfd60f3383..22058d730b7 100644 --- a/sky/core.py +++ b/sky/core.py @@ -641,7 +641,7 @@ def cancel( # all = False, len(job_ids) == 0 => no jobs to cancel. return - backend.cancel_jobs(handle, job_ids, all) + backend.cancel_jobs(handle, job_ids, all, silent=_from_serve_core) @usage_lib.entrypoint diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 4907b52e734..979b6149658 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -160,6 +160,7 @@ def _follow_logs(file: TextIO, cluster_name: str, *, finish_stream: Callable[[], bool], + exit_when_no_new_content: bool = False, no_new_content_timeout: Optional[int] = None) -> Iterator[str]: line = '' log_file = None @@ -201,12 +202,14 @@ def cluster_is_up() -> bool: for l in _follow_logs(f, cluster_name, finish_stream=cluster_is_up, + exit_when_no_new_content= + exit_when_no_new_content, no_new_content_timeout=10): yield l log_file = None line = '' else: - if finish_stream(): + if exit_when_no_new_content or finish_stream(): break if no_new_content_timeout is not None: if no_new_content_cnt >= no_new_content_timeout: @@ -265,12 +268,13 @@ def _get_replica_status() -> status_lib.ReplicaStatus: _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)) return target_info['status'] - finish_stream = (lambda: not follow or _get_replica_status() != status_lib. 
- ReplicaStatus.PROVISIONING) + finish_stream = ( + lambda: _get_replica_status() != status_lib.ReplicaStatus.PROVISIONING) with open(launch_log_file_name, 'r', newline='') as f: for line in _follow_logs(f, replica_cluster_name, - finish_stream=finish_stream): + finish_stream=finish_stream, + exit_when_no_new_content=not follow): print(line, end='', flush=True) if not follow and _get_replica_status( ) == status_lib.ReplicaStatus.PROVISIONING: From 14ddf2f4d3d95c74d884f6afc8491f2b6a456167 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 01:05:19 -0700 Subject: [PATCH 020/223] set controller task resources for when controller failed to provision best resources --- sky/cli.py | 3 ++- sky/execution.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 65406b4d2ec..0bed3a39731 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4113,7 +4113,8 @@ def serve_up( if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_up(task, service_name, controller_best_resources) + sky.serve_up(task, service_name, controller_resources, + controller_best_resources) @serve.command('status', cls=_DocumentedCodeCommand) diff --git a/sky/execution.py b/sky/execution.py index 07d47736870..938e52301a7 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -962,6 +962,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): def serve_up( task: 'sky.Task', service_name: str, + controller_resources: 'sky.Resources', controller_best_resources: 'sky.Resources', ): """Spin up a service. @@ -1019,6 +1020,8 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) + # This is for the case when the best resources failed to provision. 
+ controller_task.set_resources(controller_resources) controller_task.best_resources = controller_best_resources controller_envs = { From 8ce9c1840285bedf29ef1fb058fdb2ee91dddf22 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 24 Aug 2023 21:11:16 -0700 Subject: [PATCH 021/223] finish llm & interrupt test --- sky/serve/__init__.py | 2 + .../skyserve/{http_aws.yaml => http/aws.yaml} | 0 .../{http_azure.yaml => http/azure.yaml} | 0 .../skyserve/{http_gcp.yaml => http/gcp.yaml} | 0 .../mixed_cloud.yaml} | 0 tests/skyserve/interrupt/server.py | 30 ++++ tests/skyserve/interrupt/service.yaml | 20 +++ tests/skyserve/interrupt/test_round_robin.py | 36 ++++ tests/skyserve/llm/get_response.py | 33 ++++ tests/skyserve/llm/prompt_output.json | 5 + tests/skyserve/llm/service.yaml | 39 ++++ tests/test_smoke.py | 167 ++++++++++++++++-- 12 files changed, 314 insertions(+), 18 deletions(-) rename tests/skyserve/{http_aws.yaml => http/aws.yaml} (100%) rename tests/skyserve/{http_azure.yaml => http/azure.yaml} (100%) rename tests/skyserve/{http_gcp.yaml => http/gcp.yaml} (100%) rename tests/skyserve/{http_mixed_cloud.yaml => http/mixed_cloud.yaml} (100%) create mode 100644 tests/skyserve/interrupt/server.py create mode 100644 tests/skyserve/interrupt/service.yaml create mode 100644 tests/skyserve/interrupt/test_round_robin.py create mode 100644 tests/skyserve/llm/get_response.py create mode 100644 tests/skyserve/llm/prompt_output.json create mode 100644 tests/skyserve/llm/service.yaml diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 75f228ab47a..68bb6a02581 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -2,10 +2,12 @@ from sky.serve.constants import CONTROLLER_PORT from sky.serve.constants import CONTROLLER_PREFIX from sky.serve.constants import CONTROLLER_RESOURCES +from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.serve_utils import generate_controller_cluster_name from 
sky.serve.serve_utils import generate_controller_yaml_file_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name +from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result from sky.serve.serve_utils import ServeCodeGen diff --git a/tests/skyserve/http_aws.yaml b/tests/skyserve/http/aws.yaml similarity index 100% rename from tests/skyserve/http_aws.yaml rename to tests/skyserve/http/aws.yaml diff --git a/tests/skyserve/http_azure.yaml b/tests/skyserve/http/azure.yaml similarity index 100% rename from tests/skyserve/http_azure.yaml rename to tests/skyserve/http/azure.yaml diff --git a/tests/skyserve/http_gcp.yaml b/tests/skyserve/http/gcp.yaml similarity index 100% rename from tests/skyserve/http_gcp.yaml rename to tests/skyserve/http/gcp.yaml diff --git a/tests/skyserve/http_mixed_cloud.yaml b/tests/skyserve/http/mixed_cloud.yaml similarity index 100% rename from tests/skyserve/http_mixed_cloud.yaml rename to tests/skyserve/http/mixed_cloud.yaml diff --git a/tests/skyserve/interrupt/server.py b/tests/skyserve/interrupt/server.py new file mode 100644 index 00000000000..f8378d980be --- /dev/null +++ b/tests/skyserve/interrupt/server.py @@ -0,0 +1,30 @@ +import argparse +import functools + +from fastapi import FastAPI +import requests +import uvicorn + +app = FastAPI() + + +@functools.lru_cache(maxsize=1) +def get_self_ip() -> str: + return requests.get('http://ifconfig.me').text + + +@app.get("/get_ip") +async def get_ip(): + return {'ip': get_self_ip()} + + +@app.get("/health") +async def health(): + return {"status": "ok"} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') + parser.add_argument('--port', type=int, required=True) + args = parser.parse_args() + uvicorn.run(app, host="0.0.0.0", port=args.port) diff --git a/tests/skyserve/interrupt/service.yaml 
b/tests/skyserve/interrupt/service.yaml new file mode 100644 index 00000000000..e22b2e9994d --- /dev/null +++ b/tests/skyserve/interrupt/service.yaml @@ -0,0 +1,20 @@ +resources: + cloud: gcp + zone: us-central1-a + cpus: 2+ + +workdir: tests/skyserve/interrupt + +setup: pip install fastapi[all] httpx uvicorn + +run: python3 server.py --port 8080 + +service: + port: 8080 + readiness_probe: + path: /health + # For install dependencies + initial_delay_seconds: 180 + replicas: 3 + controller_resources: + cloud: gcp diff --git a/tests/skyserve/interrupt/test_round_robin.py b/tests/skyserve/interrupt/test_round_robin.py new file mode 100644 index 00000000000..0ba4a69eb2e --- /dev/null +++ b/tests/skyserve/interrupt/test_round_robin.py @@ -0,0 +1,36 @@ +import argparse + +import requests + +_REPEAT = 10 + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SkyServe Smoke Test Client') + parser.add_argument('--endpoint', type=str, required=True) + parser.add_argument('--replica-num', type=int, required=True) + parser.add_argument('--replica-ips', + type=str, + nargs='*', + help="All replica ips") + args = parser.parse_args() + + replica_ips = [] + for r in range(args.replica_num): + url = f'http://{args.endpoint}/get_ip' + resp = requests.get(url) + assert resp.status_code == 200, resp.text + assert 'ip' in resp.json(), resp.json() + ip = resp.json()['ip'] + assert ip not in replica_ips + replica_ips.append(ip) + + assert set(args.replica_ips) == set(replica_ips) + + for i in range(_REPEAT): + for r in range(args.replica_num): + url = f'http://{args.endpoint}/get_ip' + resp = requests.get(url) + assert resp.status_code == 200, resp.text + assert 'ip' in resp.json(), resp.json() + ip = resp.json()['ip'] + assert ip == replica_ips[r] diff --git a/tests/skyserve/llm/get_response.py b/tests/skyserve/llm/get_response.py new file mode 100644 index 00000000000..fc96d0e4a9d --- /dev/null +++ b/tests/skyserve/llm/get_response.py @@ -0,0 +1,33 @@ +import 
argparse + +import requests + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SkyServe Smoke Test Client') + parser.add_argument('--endpoint', type=str, required=True) + parser.add_argument('--prompt', type=str, required=True) + args = parser.parse_args() + + messages = [ + { + 'role': 'system', + 'content': 'You are a helpful assistant.' + }, + { + 'role': 'user', + 'content': args.prompt + }, + ] + + url = f'http://{args.endpoint}/v1/chat/completions' + resp = requests.post(url, + json={ + 'model': 'fastchat-t5-3b-v1.0', + 'messages': messages, + 'temperature': 0, + }) + + if 'choices' not in resp.json(): + with open('/home/txia/skypilot/@temp/temp.py', 'a') as f: + f.write(f'Failed: {resp.json()}\n') + print(resp.json()['choices'][0]['message']['content']) diff --git a/tests/skyserve/llm/prompt_output.json b/tests/skyserve/llm/prompt_output.json new file mode 100644 index 00000000000..778cd8a0d35 --- /dev/null +++ b/tests/skyserve/llm/prompt_output.json @@ -0,0 +1,5 @@ +{ + "Introduce yourself!": "Hello! My name is Alex. I'm an AI language model here to assist you with any questions or tasks you may have. How can I help you today?", + "What is Deep Learning? Explain in less than 30 words.": "Deep Learning is a type of machine learning that uses multiple layers of complex algorithms to learn from data.", + "Write a twitter post about skypilot, a framework for running LLMs, AI, and batch jobs on any cloud.": "\"Skypilot is a framework for running machine learning models, AI, and batch jobs on any cloud. With Skypilot, you can easily deploy and manage your machine learning models, AI models, and batch jobs across multiple cloud providers. Say goodbye to the hassle of managing multiple cloud providers and hello to a single platform for all your machine learning and AI needs. 
Try Skypilot today!\"" +} \ No newline at end of file diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml new file mode 100644 index 00000000000..07ed0c29068 --- /dev/null +++ b/tests/skyserve/llm/service.yaml @@ -0,0 +1,39 @@ +resources: + cloud: gcp + accelerators: T4 + memory: 32+ + +service: + port: 8087 + readiness_probe: /v1/models + replicas: 1 + controller_resources: + cloud: gcp + +setup: | + conda activate chatbot + if [ $? -ne 0 ]; then + conda create -n chatbot python=3.9 -y + conda activate chatbot + fi + + # Install dependencies + pip install "fschat[model_worker,webui]==0.2.24" + pip install sentencepiece protobuf + +run: | + conda activate chatbot + + echo 'Starting controller...' + python -u -m fastchat.serve.controller > ~/controller.log 2>&1 & + sleep 10 + echo 'Starting model worker...' + python -u -m fastchat.serve.model_worker \ + --model-path lmsys/fastchat-t5-3b-v1.0 2>&1 \ + | tee model_worker.log & + + echo 'Waiting for model worker to start...' + while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting openai api server...' + python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 322ea87ffca..dd39b0011af 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -24,8 +24,10 @@ import hashlib import inspect +import json import os import pathlib +import shlex import shutil import subprocess import sys @@ -42,6 +44,7 @@ import sky from sky import global_user_state +from sky import serve from sky.adaptors import cloudflare from sky.adaptors import ibm from sky.clouds import AWS @@ -2606,15 +2609,66 @@ def test_gcp_zero_quota_failover(): # ---------- Testing skyserve ---------- -def _get_skyserve_test_task(name: str, suffix: str, +# TODO(tian): Change this function after #2403 is merged. 
+def _get_service_name() -> str: + """Returns a user-unique service name for each test_skyserve_(). + + Must be called from each test_skyserve_(). + + SkyServe controller have a 11-character prefix 'controller-'. This will + generate a service name with 24 characters, which add the length of prefix, + is the max length of service name on GCP (35). + """ + caller_func_name = inspect.stack()[1][3] + test_name = caller_func_name.replace('_', '-').replace('test-', 't-') + test_name = test_name.replace('skyserve-', 'ss-') + if len(test_name) >= 16: + test_name = test_name[:11] + '-' + hashlib.md5( + test_name.encode('utf-8')).hexdigest()[:4] + return f'{test_name}-{_smoke_test_hash}-{test_id}' + + +# We check the output of the skyserve service to see if it is ready. Output of +# `REPLICAS` is in the form of `1/2` where the first number is the number of +# ready replicas and the second number is the number of total replicas. We +# grep such format to ensure that the service is ready, and early exit if any +# failure detected. In the end we sleep for serve.CONTROLLER_SYNC_INTERVAL to +# make sure redirector have enough time to sync with the controller and get all +# ready replica IPs. 
+_SERVE_WAIT_UNTIL_READY = ( + '(while true; do' + ' output=$(sky serve status {name});' + ' echo "$output" | grep -q "{replica_num}/{replica_num}" && break;' + ' echo "$output" | grep -q "FAILED" && exit 1;' + ' sleep 10;' + f' done); sleep {serve.CONTROLLER_SYNC_INTERVAL};') +_IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' +_ENDPOINT_REGEX = _IP_REGEX + r':[0-9]{1,5}' +_AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' + + +def _get_serve_endpoint(name: str) -> str: + return f'endpoint=$(sky serve status {name} | grep -Eo "{_ENDPOINT_REGEX}")' + + +def _get_replica_line(name: str, replica_id: int) -> str: + return (f'sky serve status {name} | awk "{_AWK_ALL_LINES_BELOW_REPLICAS}"' + f' | grep -E "{name}\s+{replica_id}"') + + +def _get_replica_ip(name: str, replica_id: int) -> str: + return (f'ip{replica_id}=$({_get_replica_line(name, replica_id)}' + f' | grep -Eo "{_IP_REGEX}")') + + +def _get_skyserve_http_test(name: str, cloud: str, timeout_minutes: int) -> Test: - url_regex = r'([0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}' test = Test( - f'test-skyserve-{suffix.replace("_", "-")}', + f'test-skyserve-{cloud.replace("_", "-")}', [ - f'sky serve up -n {name} -y tests/skyserve/http_{suffix}.yaml', - f'(while true; do output=$(sky serve status {name}); echo "$output" | grep -q "2/2" && break; echo "$output" | grep -q "FAILED" && exit 1; sleep 10; done)', - f'url=$(sky serve status {name} | grep -Eo "{url_regex}"); curl -L http://$url | grep "Hi, SkyPilot here"', + f'sky serve up -n {name} -y tests/skyserve/http/{cloud}.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], f'sky serve down -y {name}', timeout=timeout_minutes * 60, @@ -2623,35 +2677,112 @@ def _get_skyserve_test_task(name: str, suffix: str, @pytest.mark.gcp -def test_skyserve_gcp(): +def test_skyserve_gcp_http(): """Test skyserve on GCP""" - name = _get_cluster_name() - test = 
_get_skyserve_test_task(name, 'gcp', 20) + name = _get_service_name() + test = _get_skyserve_http_test(name, 'gcp', 20) run_one_test(test) @pytest.mark.aws -def test_skyserve_aws(): +def test_skyserve_aws_http(): """Test skyserve on AWS""" - name = _get_cluster_name() - test = _get_skyserve_test_task(name, 'aws', 20) + name = _get_service_name() + test = _get_skyserve_http_test(name, 'aws', 20) run_one_test(test) @pytest.mark.azure -def test_skyserve_azure(): +def test_skyserve_azure_http(): """Test skyserve on Azure""" - name = _get_cluster_name() - test = _get_skyserve_test_task(name, 'azure', 30) + name = _get_service_name() + test = _get_skyserve_http_test(name, 'azure', 30) run_one_test(test) @pytest.mark.gcp @pytest.mark.aws -def test_skyserve_mixed_cloud(): +def test_skyserve_mixed_cloud_http(): """Test skyserve on mixed cloud""" - name = _get_cluster_name() - test = _get_skyserve_test_task(name, 'mixed_cloud', 20) + name = _get_service_name() + test = _get_skyserve_http_test(name, 'mixed_cloud', 20) + run_one_test(test) + + +@pytest.mark.gcp +def test_skyserve_llm(): + """Test skyserve with real LLM usecase""" + name = _get_service_name() + + def generate_llm_test_command(prompt: str, expected_output: str) -> str: + prompt = shlex.quote(prompt) + expected_output = shlex.quote(expected_output) + return ( + f'{_get_serve_endpoint(name)}; python tests/skyserve/llm/get_response.py' + f' --endpoint $endpoint --prompt {prompt} | grep {expected_output}') + + with open('tests/skyserve/llm/prompt_output.json', 'r') as f: + prompt2output = json.load(f) + + test = Test( + f'test-skyserve-llm', + [ + f'sky serve up -n {name} -y tests/skyserve/llm/service.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + *[ + generate_llm_test_command(prompt, output) + for prompt, output in prompt2output.items() + ], + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_skyserve_interrupt(): + """Test skyserve 
with manually interrupting some replica""" + name = _get_service_name() + zone = 'us-central1-a' + + # Reference: test_spot_recovery_gcp + def terminate_cmd(replica_id: int) -> str: + cluster_name = serve.generate_replica_cluster_name(name, replica_id) + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{cluster_name})" ' + f'--zones={zone} --format="value(name)"') + return (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + + test = Test( + f'test-skyserve-interrupt', + [ + f'sky serve up -n {name} -y tests/skyserve/interrupt/service.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), + f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 1)}; ' + f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' + 'python tests/skyserve/interrupt/test_round_robin.py ' + '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', + terminate_cmd(1), + f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', + f'sky serve status {name} | grep 2/3', + f'{_get_replica_line(name, 1)} | grep NOT_READY', + f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 2)}; ' + f'{_get_replica_ip(name, 3)}; ' + 'python tests/skyserve/interrupt/test_round_robin.py ' + '--endpoint $endpoint --replica-num 2 --replica-ips $ip2 $ip3', + terminate_cmd(2), + f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', + f'sky serve status {name} | grep 1/3', + f'{_get_replica_line(name, 2)} | grep NOT_READY', + f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 3)}; ' + 'python tests/skyserve/interrupt/test_round_robin.py ' + '--endpoint $endpoint --replica-num 1 --replica-ips $ip3', + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) run_one_test(test) From 6fae8ac8ea19b9f65e3f603f2aebaec8c9b909ba Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 14:13:54 -0700 Subject: [PATCH 022/223] early check cluster name is valid --- sky/execution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/execution.py 
b/sky/execution.py index 938e52301a7..a2cb55cc7d9 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -976,6 +976,8 @@ def serve_up( """ controller_cluster_name = serve.generate_controller_cluster_name( service_name) + controller_best_resources.cloud.check_cluster_name_is_valid( + controller_cluster_name) assert task.service is not None, task assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] From 21c07ccb4645e69a9cfbe94f901c5e916baf94ac Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 14:18:16 -0700 Subject: [PATCH 023/223] add hint message for tailing replica job status --- sky/serve/serve_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 979b6149658..b7e176c81aa 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -281,6 +281,10 @@ def _get_replica_status() -> status_lib.ReplicaStatus: # Early exit if not following the logs. return '' + # Notify user here to make sure user won't think the log is finished. + print(f'{colorama.Fore.YELLOW}Start streaming logs for task job ' + f'of replica {replica_id}.{colorama.Style.RESET_ALL}') + backend = backends.CloudVmRayBackend() # Always tail the logs of the first job, which represent user setup & run. 
returncode = backend.tail_logs(handle, job_id=1, follow=follow) From 377789bf1c58faf84030ab47a371da88579a4654 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 14:52:43 -0700 Subject: [PATCH 024/223] make dict thread safe --- sky/serve/infra_providers.py | 8 +++++--- sky/serve/serve_utils.py | 37 ++++++++++++++++++++++++++++++++++++ sky/serve/service_spec.py | 2 +- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 84ec119aaf5..b9f3f8fbe9c 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -200,7 +200,7 @@ def __init__( initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: # TODO(tian): make this thread safe - self.replica_info: Dict[str, ReplicaInfo] = dict() + self.replica_info: Dict[str, ReplicaInfo] = serve_utils.ThreadSafeDict() self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data @@ -251,8 +251,10 @@ def __init__(self, task_yaml_path: str, service_name: str, *args, self.task_yaml_path: str = task_yaml_path self.service_name: str = service_name self.next_replica_id: int = 1 - self.launch_process_pool: Dict[str, subprocess.Popen] = dict() - self.down_process_pool: Dict[str, subprocess.Popen] = dict() + self.launch_process_pool: Dict[ + str, subprocess.Popen] = serve_utils.ThreadSafeDict() + self.down_process_pool: Dict[ + str, subprocess.Popen] = serve_utils.ThreadSafeDict() self._start_process_pool_refresher() self._start_job_status_fetcher() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index b7e176c81aa..9c2dff28084 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -4,6 +4,7 @@ import pickle import re import shlex +import threading import time import typing from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO @@ -30,6 +31,42 @@ f' 
to check all valid replica id.{colorama.Style.RESET_ALL}') +class ThreadSafeDict(dict): + """A thread-safe dict.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._lock = threading.Lock() + + def __getitem__(self, __key: Any) -> Any: + with self._lock: + return super().__getitem__(__key) + + def __setitem__(self, __key: Any, __value: Any) -> None: + with self._lock: + return super().__setitem__(__key, __value) + + def __delitem__(self, __key: Any) -> None: + with self._lock: + return super().__delitem__(__key) + + def __len__(self) -> int: + with self._lock: + return super().__len__() + + def __contains__(self, __key: object) -> bool: + with self._lock: + return super().__contains__(__key) + + def items(self): + with self._lock: + return super().items() + + def values(self): + with self._lock: + return super().values() + + def generate_controller_cluster_name(service_name: str) -> str: return constants.CONTROLLER_PREFIX + service_name diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index db06ab79d28..0019aebc8d1 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -67,7 +67,7 @@ def from_yaml_config(config: Optional[Dict[str, Any]]): return None backend_utils.validate_schema(config, schemas.get_service_schema(), - 'Invalid service YAML:') + 'Invalid service YAML: ') if 'replicas' in config and 'replica_policy' in config: with ux_utils.print_exception_no_traceback(): raise ValueError( From b2f2388fd812d8561e0e2bc2c495d55d7f555d86 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 14:53:25 -0700 Subject: [PATCH 025/223] upd doc --- sky/serve/README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sky/serve/README.md b/sky/serve/README.md index 4d86bbf4bbd..f8f33df6244 100644 --- a/sky/serve/README.md +++ b/sky/serve/README.md @@ -15,11 +15,5 @@ Sky Serve has four key components: 4. Infra Providers - provides a uniform interface to talk to SkyPilot. 
## Usage -** Work in progress** -```bash -# Run controller. -python -m sky.serve.controller --task-yaml examples/fastchat/api_server.yaml -# Run redirector. -python -m sky.serve.redirector -``` \ No newline at end of file +[User doc](https://docs.google.com/document/d/1vVmzLF-EkG3Moj-q47DQBGvFipK4PNfkz0V6LyaPstE/edit) From 7bfeec9072cca2545e462b54975ed5bddd577ba0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 18:45:22 -0700 Subject: [PATCH 026/223] rename redirector to lb --- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/cli.py | 22 +++---- sky/execution.py | 33 ++++++----- sky/serve/autoscalers.py | 2 +- sky/serve/controller.py | 10 ++-- sky/serve/examples/http_server/task.yaml | 3 + sky/serve/{redirector.py => load_balancer.py} | 59 ++++++++++--------- ...ancers.py => load_balancing_algorithms.py} | 10 ++-- sky/status_lib.py | 2 +- tests/test_smoke.py | 4 +- 10 files changed, 78 insertions(+), 69 deletions(-) rename sky/serve/{redirector.py => load_balancer.py} (66%) rename sky/serve/{load_balancers.py => load_balancing_algorithms.py} (90%) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index acaa003563d..553a2a9944e 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3202,7 +3202,7 @@ def _exec_code_on_head( f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') elif not name.startswith(serve_lib.CONTROLLER_PREFIX): - # Skip logging for submit controller & redirector jobs + # Skip logging for submit controller & load balancer jobs # to skyserve controller cluster logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' diff --git a/sky/cli.py b/sky/cli.py index 0bed3a39731..74acf726f10 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4141,11 +4141,11 @@ def serve_status(all: bool, service_name: Optional[str]): - ``CONTROLLER_INIT``: The controller is initializing. 
- ``REPLICA_INIT``: The controller provisioning have succeeded; controller - and redirector process is alive, and there are no available replicas for - now. This also indicates that no replica failure has been detected. + and load balancer process is alive, and there are no available replicas + for now. This also indicates that no replica failure has been detected. - ``CONTROLLER_FAILED``: The controller failed to start or in an abnormal - state; or the controller and redirector process is not alive. + state; or the controller and load balancer process is not alive. - ``READY``: The controller is ready to serve requests. This means that at least one replica have passed the readiness probe. @@ -4351,11 +4351,11 @@ def _down_service(name: str): default=False, required=False, help='Show the controller logs of this service.') -@click.option('--redirector', +@click.option('--load-balancer', is_flag=True, default=False, required=False, - help='Show the redirector logs of this service.') + help='Show the load balancer logs of this service.') @click.argument('service_name', required=True, type=str, @@ -4366,7 +4366,7 @@ def serve_logs( service_name: str, follow: bool, controller: bool, - redirector: bool, + load_balancer: bool, replica_id: Optional[int], ): """Tail the log of a service. 
@@ -4378,16 +4378,16 @@ def serve_logs( # Tail the controller logs of a service sky serve logs --controller [SERVICE_ID] \b - # Print the redirector logs so far and exit - sky serve logs --redirector --no-follow [SERVICE_ID] + # Print the load balancer logs so far and exit + sky serve logs --load-balancer --no-follow [SERVICE_ID] \b # Tail the logs of replica 1 sky serve logs [SERVICE_ID] 1 """ have_replica_id = replica_id is not None - if (controller + redirector + have_replica_id) != 1: + if (controller + load_balancer + have_replica_id) != 1: raise click.UsageError( - 'One and only one of --controller, --redirector, ' + 'One and only one of --controller, --load-balancer, ' '[REPLICA_ID] can be specified.') service_record = global_user_state.get_service_from_name(service_name) if service_record is None: @@ -4396,7 +4396,7 @@ def serve_logs( controller_cluster_name = service_record['handle'].controller_cluster_name if controller: core.tail_logs(controller_cluster_name, job_id=1, follow=follow) - elif redirector: + elif load_balancer: core.tail_logs(controller_cluster_name, job_id=2, follow=follow) else: core.serve_tail_logs(service_record, replica_id, follow=follow) diff --git a/sky/execution.py b/sky/execution.py index a2cb55cc7d9..cd7fb33e84c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1078,7 +1078,7 @@ def _wait_until_job_is_running(cluster_name: str, # NOTICE: The job submission order cannot be changed since the # `sky serve logs` CLI will identify the controller job with - # the first job submitted and the redirector job with the second + # the first job submitted and the load balancer job with the second # job submitted. with console.status('[yellow]Launching controller process...[/yellow]'): _execute( @@ -1106,13 +1106,14 @@ def _wait_until_job_is_running(cluster_name: str, print(f'{colorama.Fore.GREEN}Launching controller process...done.' 
f'{colorama.Style.RESET_ALL}') - with console.status('[yellow]Launching redirector process...[/yellow]'): + with console.status( + '[yellow]Launching load balancer process...[/yellow]'): controller_addr = f'http://localhost:{serve.CONTROLLER_PORT}' _execute( entrypoint=sky.Task( - name='run-redirector', + name='run-load-balancer', envs=controller_envs, - run='python -m sky.serve.redirector --task-yaml ' + run='python -m sky.serve.load_balancer --task-yaml ' f'{remote_task_yaml_path} --port {app_port} ' f'--controller-addr {controller_addr}'), stream_logs=False, @@ -1121,16 +1122,16 @@ def _wait_until_job_is_running(cluster_name: str, cluster_name=controller_cluster_name, detach_run=True, ) - redirector_job_is_running = _wait_until_job_is_running( + load_balancer_job_is_running = _wait_until_job_is_running( controller_cluster_name, 2) - if not redirector_job_is_running: + if not load_balancer_job_is_running: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - print(f'{colorama.Fore.RED}Redirector failed to launch. ' + print(f'{colorama.Fore.RED}LoadBalancer failed to launch. ' f'Please check the logs with sky serve logs {service_name} ' - f'--redirector{colorama.Style.RESET_ALL}') + f'--load-balancer{colorama.Style.RESET_ALL}') return - print(f'{colorama.Fore.GREEN}Launching redirector process...done.' + print(f'{colorama.Fore.GREEN}Launching load balancer process...done.' 
f'{colorama.Style.RESET_ALL}') global_user_state.set_service_status( @@ -1141,15 +1142,15 @@ def _wait_until_job_is_running(cluster_name: str, '\nTo see detailed info:' f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' f'{backend_utils.RESET_BOLD}' - '\nTo see logs of controller:' - f'\t{backend_utils.BOLD}sky serve logs --controller ' - f'{service_name}{backend_utils.RESET_BOLD}' - '\nTo see logs of redirector:' - f'\t{backend_utils.BOLD}sky serve logs --redirector ' - f'{service_name}{backend_utils.RESET_BOLD}' '\nTo see logs of one replica:' f'\t{backend_utils.BOLD}sky serve logs {service_name} ' f'[REPLICA_ID]{backend_utils.RESET_BOLD}' + '\nTo see logs of load balancer:' + f'\t{backend_utils.BOLD}sky serve logs --load-balancer ' + f'{service_name}{backend_utils.RESET_BOLD}' + '\nTo see logs of controller:' + f'\t{backend_utils.BOLD}sky serve logs --controller ' + f'{service_name}{backend_utils.RESET_BOLD}' '\nTo teardown the service:' f'\t{backend_utils.BOLD}sky serve down {service_name}' f'{backend_utils.RESET_BOLD}' @@ -1238,7 +1239,7 @@ def serve_down( exceptions.CommandError) as e: if purge: logger.warning('Ignoring error when stopping controller and ' - f'redirector jobs of service {service_name}: {e}') + f'load balancer jobs of service {service_name}: {e}') else: raise RuntimeError(e) from e diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 6bd4ab74bc3..bc712aaa696 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -80,7 +80,7 @@ def __init__(self, *args, upper_threshold: Optional[float], self.cooldown: int = cooldown # Query interval for requests num. Every `query_interval` seconds, # Autoscaler will received an update for number of requests from - # redirector. + # load balancer. self.query_interval: int = query_interval # Time of last scale operation self.last_scale_operation: float = 0. 
diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 9493808908b..80de83325d0 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -1,4 +1,4 @@ -"""Controller: the central controller of SkyServe. +"""SkyServeController: the central controller of SkyServe. Responsible for autoscaling and replica management. """ @@ -35,8 +35,8 @@ def filter(self, record: logging.LogRecord) -> bool: return not ('GET' in message and '200' in message) -class Controller: - """Controller: control everything about replica. +class SkyServeController: + """SkyServeController: control everything about replica. This class is responsible for: - Starting and terminating the replica monitor and autoscaler. @@ -153,6 +153,6 @@ def terminate(request: fastapi.Request): cooldown=60, query_interval=60) - # ======= Controller ========= - controller = Controller(args.port, _infra_provider, _autoscaler) + # ======= SkyServeController ========= + controller = SkyServeController(args.port, _infra_provider, _autoscaler) controller.run() diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml index 965e7a5c39f..967abfffeff 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/sky/serve/examples/http_server/task.yaml @@ -1,4 +1,5 @@ resources: + cloud: gcp cpus: 2+ workdir: sky/serve/examples/http_server @@ -11,3 +12,5 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 + controller_resources: + cloud: gcp diff --git a/sky/serve/redirector.py b/sky/serve/load_balancer.py similarity index 66% rename from sky/serve/redirector.py rename to sky/serve/load_balancer.py index a94819bc7f0..9f7ebac5ee7 100644 --- a/sky/serve/redirector.py +++ b/sky/serve/load_balancer.py @@ -1,4 +1,4 @@ -"""Redirector: redirect any incoming request to an endpoint replica.""" +"""LoadBalancer: redirect any incoming request to an endpoint replica.""" import argparse import threading import time @@ -9,39 +9,42 @@ from sky import sky_logging from 
sky.serve import constants -from sky.serve import load_balancers +from sky.serve import load_balancing_algorithms # Use the explicit logger name so that the logger is under the -# `sky.serve.redirector` namespace when executed directly, so as +# `sky.serve.load_balancer` namespace when executed directly, so as # to inherit the setup from the `sky` logger. -logger = sky_logging.init_logger('sky.serve.redirector') +logger = sky_logging.init_logger('sky.serve.load_balancer') +_LBAlgorithm = load_balancing_algorithms.LoadBalancingAlgorithm -class SkyServeRedirector: - """Redirector: redirect incoming traffic. + +class SkyServeLoadBalancer: + """SkyServeLoadBalancer: redirect incoming traffic. This class accept any traffic to the controller and redirect it - to the appropriate endpoint replica. + to the appropriate endpoint replica according to the load balancing + algorithm. """ def __init__(self, controller_url: str, port: int, - load_balancer: load_balancers.LoadBalancer): + load_balancing_algorithm: _LBAlgorithm) -> None: self.app = fastapi.FastAPI() self.controller_url = controller_url self.port = port - self.load_balancer = load_balancer + self.lb_algorithm = load_balancing_algorithm for i in range(3): resp = requests.get(self.controller_url + '/controller/get_autoscaler_query_interval') if resp.status_code == 200: - self.load_balancer.set_query_interval( + self.lb_algorithm.set_query_interval( resp.json()['query_interval']) break if i == 2: logger.error('Failed to get autoscaler query interval. 
' 'Use default interval instead.') - self.load_balancer.set_query_interval(None) + self.lb_algorithm.set_query_interval(None) time.sleep(10) def _sync_with_controller(self): @@ -53,7 +56,7 @@ def _sync_with_controller(self): self.controller_url + '/controller/update_num_requests', json={ 'num_requests': - self.load_balancer.deprecate_old_requests() + self.lb_algorithm.deprecate_old_requests() }, timeout=5) response.raise_for_status() @@ -66,12 +69,12 @@ def _sync_with_controller(self): print(f'An error occurred: {e}') else: logger.info(f'Available Replica IPs: {ready_replicas}') - self.load_balancer.set_ready_replicas(ready_replicas) + self.lb_algorithm.set_ready_replicas(ready_replicas) time.sleep(constants.CONTROLLER_SYNC_INTERVAL) - async def _redirector_handler(self, request: fastapi.Request): - self.load_balancer.increment_request_count(1) - replica_ip = self.load_balancer.select_replica(request) + async def _redirect_handler(self, request: fastapi.Request): + self.lb_algorithm.increment_request_count(1) + replica_ip = self.lb_algorithm.select_replica(request) if replica_ip is None: raise fastapi.HTTPException(status_code=503, @@ -85,7 +88,7 @@ async def _redirector_handler(self, request: fastapi.Request): def run(self): self.app.add_api_route('/{path:path}', - self._redirector_handler, + self._redirect_handler, methods=['GET', 'POST', 'PUT', 'DELETE']) sync_controller_thread = threading.Thread( @@ -93,14 +96,14 @@ def run(self): sync_controller_thread.start() logger.info( - f'SkyServe Redirector started on http://0.0.0.0:{self.port}') + f'SkyServe Load Balancer started on http://0.0.0.0:{self.port}') uvicorn.run(self.app, host='0.0.0.0', port=self.port) if __name__ == '__main__': # Add argparse - parser = argparse.ArgumentParser(description='SkyServe Redirector') + parser = argparse.ArgumentParser(description='SkyServe Load Balancer') parser.add_argument('--task-yaml', type=str, help='Task YAML file', @@ -108,7 +111,7 @@ def run(self): 
parser.add_argument('--port', '-p', type=int, - help='Port to run the redirector on.', + help='Port to run the load balancer on.', required=True) parser.add_argument('--controller-addr', type=str, @@ -116,11 +119,13 @@ def run(self): required=True) args = parser.parse_args() - # ======= Load Balancer ========= - _load_balancer = load_balancers.RoundRobinLoadBalancer() + # ======= Load Balancing Algorithm ========= + _load_balancing_algorithm = ( + load_balancing_algorithms.RoundRobinLoadBalancingAlgorithm()) - # ======= Redirector ========= - redirector = SkyServeRedirector(controller_url=args.controller_addr, - port=args.port, - load_balancer=_load_balancer) - redirector.run() + # ======= SkyServeLoadBalancer ========= + load_balancer = SkyServeLoadBalancer( + controller_url=args.controller_addr, + port=args.port, + load_balancing_algorithm=_load_balancing_algorithm) + load_balancer.run() diff --git a/sky/serve/load_balancers.py b/sky/serve/load_balancing_algorithms.py similarity index 90% rename from sky/serve/load_balancers.py rename to sky/serve/load_balancing_algorithms.py index 60915fe3ae3..b0c943b757e 100644 --- a/sky/serve/load_balancers.py +++ b/sky/serve/load_balancing_algorithms.py @@ -1,4 +1,4 @@ -"""LoadBalancer: select endpoint by load balancing algorithm.""" +"""LoadBalancingAlgorithms: algorithm to select endpoint.""" from collections import deque import logging import time @@ -11,8 +11,8 @@ _DEFAULT_QUERY_INTERVAL = 60 -class LoadBalancer: - """Abstract class for load balancers.""" +class LoadBalancingAlgorithm: + """Abstract class for load balancing algorithms.""" def __init__(self) -> None: self.ready_replicas: Set[str] = set() @@ -49,8 +49,8 @@ def select_replica(self, request: fastapi.Request) -> Optional[str]: raise NotImplementedError -class RoundRobinLoadBalancer(LoadBalancer): - """Round-robin load balancer.""" +class RoundRobinLoadBalancingAlgorithm(LoadBalancingAlgorithm): + """Round-robin load balancing algorithm.""" def __init__(self, 
*args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/sky/status_lib.py b/sky/status_lib.py index dc0d2f080eb..2d093818807 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -60,7 +60,7 @@ class ServiceStatus(enum.Enum): # Replica is initializing and no failure REPLICA_INIT = 'REPLICA_INIT' - # Controller failed to initialize / controller or redirector process + # Controller failed to initialize / controller or load balancer process # status abnormal CONTROLLER_FAILED = 'CONTROLLER_FAILED' diff --git a/tests/test_smoke.py b/tests/test_smoke.py index dd39b0011af..27791cb0a40 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2633,8 +2633,8 @@ def _get_service_name() -> str: # ready replicas and the second number is the number of total replicas. We # grep such format to ensure that the service is ready, and early exit if any # failure detected. In the end we sleep for serve.CONTROLLER_SYNC_INTERVAL to -# make sure redirector have enough time to sync with the controller and get all -# ready replica IPs. +# make sure load balancer have enough time to sync with the controller and get +# all ready replica IPs. 
_SERVE_WAIT_UNTIL_READY = ( '(while true; do' ' output=$(sky serve status {name});' From 082c88b7a38c34c5e6af09bd5d8a48dd446820d5 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 21:25:20 -0700 Subject: [PATCH 027/223] rename sky serve controller prefix --- sky/serve/constants.py | 4 ++-- ...yserve-controller.yaml.j2 => sky-serve-controller.yaml.j2} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename sky/templates/{skyserve-controller.yaml.j2 => sky-serve-controller.yaml.j2} (100%) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 10468bc1e5a..91ea9b48c46 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -1,8 +1,8 @@ """Constants used for SkyServe.""" -CONTROLLER_PREFIX = 'controller-' +CONTROLLER_PREFIX = 'sky-serve-controller-' -CONTROLLER_TEMPLATE = 'skyserve-controller.yaml.j2' +CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' SERVE_PREFIX = '~/.sky/serve' diff --git a/sky/templates/skyserve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 similarity index 100% rename from sky/templates/skyserve-controller.yaml.j2 rename to sky/templates/sky-serve-controller.yaml.j2 From a4c8fc293c0d331e326be3987031938f8d713b8e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 21:27:28 -0700 Subject: [PATCH 028/223] restore example --- sky/serve/examples/http_server/task.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/sky/serve/examples/http_server/task.yaml b/sky/serve/examples/http_server/task.yaml index 967abfffeff..965e7a5c39f 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/sky/serve/examples/http_server/task.yaml @@ -1,5 +1,4 @@ resources: - cloud: gcp cpus: 2+ workdir: sky/serve/examples/http_server @@ -12,5 +11,3 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 - controller_resources: - cloud: gcp From 63b2601cbbb1271d4cd27133fb0fc342f110667a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Aug 2023 22:33:29 -0700 Subject: [PATCH 029/223] upd 
smoke test --- tests/test_smoke.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 33393befce0..40fd729f118 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2639,23 +2639,16 @@ def test_gcp_zero_quota_failover(): # ---------- Testing skyserve ---------- -# TODO(tian): Change this function after #2403 is merged. def _get_service_name() -> str: """Returns a user-unique service name for each test_skyserve_(). Must be called from each test_skyserve_(). - - SkyServe controller have a 11-character prefix 'controller-'. This will - generate a service name with 24 characters, which add the length of prefix, - is the max length of service name on GCP (35). """ caller_func_name = inspect.stack()[1][3] test_name = caller_func_name.replace('_', '-').replace('test-', 't-') test_name = test_name.replace('skyserve-', 'ss-') - if len(test_name) >= 16: - test_name = test_name[:11] + '-' + hashlib.md5( - test_name.encode('utf-8')).hexdigest()[:4] - return f'{test_name}-{_smoke_test_hash}-{test_id}' + test_name = common_utils.make_cluster_name_on_cloud(test_name, 24) + return f'{test_name}-{test_id}' # We check the output of the skyserve service to see if it is ready. Output of From 14621d3dc6d29ec9e28fe8b3231f500cbab7bd00 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 26 Aug 2023 21:10:33 -0700 Subject: [PATCH 030/223] use asyncio --- sky/serve/controller.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 80de83325d0..c557e915873 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -3,6 +3,7 @@ Responsible for autoscaling and replica management. 
""" import argparse +import asyncio import base64 import logging import pickle @@ -55,9 +56,9 @@ def __init__(self, def run(self) -> None: @self.app.post('/controller/update_num_requests') - async def update_num_requests(request: fastapi.Request): + def update_num_requests(request: fastapi.Request): # await request - request_data = await request.json() + request_data = asyncio.run(request.json()) # get request data num_requests = request_data['num_requests'] logger.info(f'Received request: {request_data}') From 55d3503b10e6e21a37a0fc961f6a9318e51fa9e4 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 27 Aug 2023 00:48:33 -0700 Subject: [PATCH 031/223] change core with underlying function to avoid usage collection on status --- sky/backends/backend_utils.py | 3 +- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/core.py | 53 ++++++++++++---------------- sky/execution.py | 16 +++++++-- sky/serve/infra_providers.py | 10 +++--- sky/skylet/job_lib.py | 2 +- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b5b895b1cb3..78d0c81e969 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1020,7 +1020,8 @@ def write_cluster_config( 'disk_size': to_provision.disk_size, # If the current code is run by controller, propagate the real # calling user which should've been passed in as the - # SKYPILOT_USER env var (see spot-controller.yaml.j2). + # SKYPILOT_USER env var (see spot-controller.yaml.j2), also + # execution.py::serve_up. 
'user': get_cleaned_username(os.environ.get( 'SKYPILOT_USER', '')), diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index da10b14e360..b16ace2ab5a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3468,7 +3468,7 @@ def get_job_status( handle: CloudVmRayResourceHandle, job_ids: Optional[List[int]] = None, stream_logs: bool = True - ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: + ) -> Dict[Optional[str], Optional[job_lib.JobStatus]]: code = job_lib.JobLibCodeGen.get_job_status(job_ids) returncode, stdout, stderr = self.run_on_head(handle, code, diff --git a/sky/core.py b/sky/core.py index 22058d730b7..d45988e8ca0 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1,7 +1,7 @@ """SDK functions for cluster/job management.""" import getpass import sys -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import colorama @@ -560,7 +560,6 @@ def cancel( job_ids: Optional[List[int]] = None, # pylint: disable=invalid-name _try_cancel_if_cluster_is_init: bool = False, - _from_serve_core: bool = False, ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Cancel jobs on a cluster. @@ -585,9 +584,8 @@ def cancel( sky.exceptions.CloudUserIdentityError: if we fail to get the current user identity. """ - if not _from_serve_core: - backend_utils.check_cluster_name_not_reserved( - cluster_name, operation_str='Cancelling jobs') + backend_utils.check_cluster_name_not_reserved( + cluster_name, operation_str='Cancelling jobs') if all and job_ids: raise ValueError('Cannot specify both `all` and `job_ids`. 
To cancel ' @@ -618,30 +616,28 @@ def cancel( backend = backend_utils.get_backend_from_handle(handle) - printfn: Callable[[str], None] = sky_logging.print - if _from_serve_core: - printfn = lambda msg: None - if all: - printfn(f'{colorama.Fore.YELLOW}' - f'Cancelling all jobs on cluster {cluster_name!r}...' - f'{colorama.Style.RESET_ALL}') + sky_logging.print(f'{colorama.Fore.YELLOW}' + f'Cancelling all jobs on cluster {cluster_name!r}...' + f'{colorama.Style.RESET_ALL}') elif job_ids is None: # all = False, job_ids is None => cancel the latest running job. - printfn(f'{colorama.Fore.YELLOW}' - f'Cancelling latest running job on cluster {cluster_name!r}...' - f'{colorama.Style.RESET_ALL}') + sky_logging.print( + f'{colorama.Fore.YELLOW}' + f'Cancelling latest running job on cluster {cluster_name!r}...' + f'{colorama.Style.RESET_ALL}') elif len(job_ids): # all = False, len(job_ids) > 0 => cancel the specified jobs. jobs_str = ', '.join(map(str, job_ids)) - printfn(f'{colorama.Fore.YELLOW}' - f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...' - f'{colorama.Style.RESET_ALL}') + sky_logging.print( + f'{colorama.Fore.YELLOW}' + f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...' + f'{colorama.Style.RESET_ALL}') else: # all = False, len(job_ids) == 0 => no jobs to cancel. return - backend.cancel_jobs(handle, job_ids, all, silent=_from_serve_core) + backend.cancel_jobs(handle, job_ids, all) @usage_lib.entrypoint @@ -726,12 +722,10 @@ def download_logs( @usage_lib.entrypoint -def job_status( - cluster_name: str, - job_ids: Optional[List[int]], - silent: bool = False, - stream_logs: bool = False -) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: +def job_status(cluster_name: str, + job_ids: Optional[List[int]], + stream_logs: bool = False + ) -> Dict[Optional[str], Optional[job_lib.JobStatus]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Get the status of jobs. 
@@ -739,7 +733,7 @@ def job_status( cluster_name: (str) name of the cluster. job_ids: (List[str]) job ids. If None, get the status of the last job. Returns: - Dict[Optional[int], Optional[job_lib.JobStatus]]: A mapping of job_id to + Dict[Optional[str], Optional[job_lib.JobStatus]]: A mapping of job_id to job statuses. The status will be None if the job does not exist. If job_ids is None and there is no job on the cluster, it will return {None: None}. @@ -768,10 +762,9 @@ def job_status( if job_ids is not None and len(job_ids) == 0: return {} - if not silent: - sky_logging.print(f'{colorama.Fore.YELLOW}' - 'Getting job status...' - f'{colorama.Style.RESET_ALL}') + sky_logging.print(f'{colorama.Fore.YELLOW}' + 'Getting job status...' + f'{colorama.Style.RESET_ALL}') usage_lib.record_cluster_name_for_current_operation(cluster_name) statuses = backend.get_job_status(handle, job_ids, stream_logs=stream_logs) diff --git a/sky/execution.py b/sky/execution.py index 48d85d261c3..d28fa5514b8 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1027,7 +1027,9 @@ def serve_up( controller_task.best_resources = controller_best_resources controller_envs = { + 'SKYPILOT_USER_ID': common_utils.get_user_hash(), 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, + 'SKYPILOT_USER': getpass.getuser(), 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), 'SKYPILOT_DISABLE_USAGE_COLLECTION': @@ -1067,9 +1069,14 @@ def serve_up( def _wait_until_job_is_running(cluster_name: str, job_id: int, retry_time: int = 30) -> bool: + handle = global_user_state.get_handle_from_cluster_name( + cluster_name) + assert isinstance(handle, backends.CloudVmRayResourceHandle), handle + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend), backend for _ in range(retry_time): - job_statuses = core.job_status(cluster_name, [job_id], - silent=True) + job_statuses = 
backend.get_job_status(handle, [job_id], + stream_logs=False) job_status = job_statuses.get(str(job_id), None) if job_status == job_lib.JobStatus.RUNNING: return True @@ -1234,7 +1241,10 @@ def serve_down( ) try: - core.cancel(controller_cluster_name, all=True, _from_serve_core=True) + if handle is not None: + assert isinstance(handle, backends.CloudVmRayResourceHandle) + backend = backends.CloudVmRayBackend() + backend.cancel_jobs(handle, jobs=None, cancel_all=True, silent=True) except (ValueError, exceptions.ClusterNotUpError, exceptions.CommandError) as e: if purge: diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index b9f3f8fbe9c..815ca3951d5 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -13,7 +13,6 @@ import requests from sky import backends -from sky import core from sky import global_user_state from sky import status_lib from sky.serve import serve_utils @@ -334,8 +333,12 @@ def _fetch_job_status(self) -> None: for cluster_name, info in self.replica_info.items(): if not info.status_property.should_track_status(): continue + backend = backends.CloudVmRayBackend() + handle = info.handle + assert handle is not None, info # Only fetch job 1, which stands for user task job - job_statuses = core.job_status(cluster_name, [1]) + job_statuses = backend.get_job_status(handle, [1], + stream_logs=False) job_status = job_statuses['1'] if job_status in [ job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP @@ -343,9 +346,6 @@ def _fetch_job_status(self) -> None: info.status_property.user_app_failed = True logger.warning(f'User APP for cluster {cluster_name} FAILED. ' 'Start streaming logs...') - backend = backends.CloudVmRayBackend() - handle = info.handle - assert handle is not None, info # Always tail the logs of the first job, which represent user # setup & run. 
try: diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index 44787ebd429..5126f40b960 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -367,7 +367,7 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str: def load_statuses_payload( - statuses_payload: str) -> Dict[Optional[int], Optional[JobStatus]]: + statuses_payload: str) -> Dict[Optional[str], Optional[JobStatus]]: statuses = common_utils.decode_payload(statuses_payload) for job_id, status in statuses.items(): if status is not None: From c9d2e2815e31e97be13019edf7df5e17e75bce5c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 27 Aug 2023 00:50:40 -0700 Subject: [PATCH 032/223] reatore comma --- sky/utils/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index e0a5235b0da..3c664615efb 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -276,7 +276,7 @@ def get_task_schema(): 'additionalProperties': { 'type': 'number' } - } + }, } } From d236eec49cbde901d9c534ce5698c356c4674fe7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 27 Aug 2023 11:42:35 -0700 Subject: [PATCH 033/223] add comment --- sky/serve/infra_providers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 815ca3951d5..83d8ff7cfd0 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -333,6 +333,8 @@ def _fetch_job_status(self) -> None: for cluster_name, info in self.replica_info.items(): if not info.status_property.should_track_status(): continue + # We use backend API to avoid usage collection in the + # core.job_status. 
backend = backends.CloudVmRayBackend() handle = info.handle assert handle is not None, info From e3e698c4e25859a33c250d02906ee8bb4589c2bb Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 27 Aug 2023 20:52:41 -0700 Subject: [PATCH 034/223] adopt comments in #2473 --- sky/execution.py | 1 + sky/serve/controller.py | 8 ++++---- sky/serve/infra_providers.py | 8 ++++---- sky/serve/serve_utils.py | 36 ++++++++++++++++++++---------------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index d28fa5514b8..f95dfd9a50c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -972,6 +972,7 @@ def serve_up( Args: task: sky.Task to serve up. service_name: Name of the service. + controller_resources: The resources requirement for the controller. controller_best_resources: The optimized resources for the controller. """ controller_cluster_name = serve.generate_controller_cluster_name( diff --git a/sky/serve/controller.py b/sky/serve/controller.py index c557e915873..0599c107bee 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -24,10 +24,6 @@ # to inherit the setup from the `sky` logger. logger = sky_logging.init_logger('sky.serve.controller') -# Generate ssh key pair to avoid race condition when multiple sky.launch -# are executed at the same time. -authentication.get_or_generate_keys() - class SuppressSuccessGetAccessLogsFilter(logging.Filter): @@ -134,6 +130,10 @@ def terminate(request: fastapi.Request): required=True) args = parser.parse_args() + # Generate ssh key pair to avoid race condition when multiple sky.launch + # are executed at the same time. 
+ authentication.get_or_generate_keys() + # ======= Infra Provider ========= service_spec = serve.SkyServiceSpec.from_yaml(args.task_yaml) _infra_provider = infra_providers.SkyPilotInfraProvider( diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 83d8ff7cfd0..0c706ac1ea5 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -198,8 +198,8 @@ def __init__( readiness_suffix: str, initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: - # TODO(tian): make this thread safe - self.replica_info: Dict[str, ReplicaInfo] = serve_utils.ThreadSafeDict() + self.replica_info: serve_utils.ThreadSafeDict[ + str, ReplicaInfo] = serve_utils.ThreadSafeDict() self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data @@ -250,9 +250,9 @@ def __init__(self, task_yaml_path: str, service_name: str, *args, self.task_yaml_path: str = task_yaml_path self.service_name: str = service_name self.next_replica_id: int = 1 - self.launch_process_pool: Dict[ + self.launch_process_pool: serve_utils.ThreadSafeDict[ str, subprocess.Popen] = serve_utils.ThreadSafeDict() - self.down_process_pool: Dict[ + self.down_process_pool: serve_utils.ThreadSafeDict[ str, subprocess.Popen] = serve_utils.ThreadSafeDict() self._start_process_pool_refresher() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 9c2dff28084..3cc70b59b3e 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -7,7 +7,8 @@ import threading import time import typing -from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO +from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, + TextIO, TypeVar) import colorama import requests @@ -30,41 +31,44 @@ '{replica_id}. 
Please use `sky serve status [SERVICE_ID]`' f' to check all valid replica id.{colorama.Style.RESET_ALL}') +KeyType = TypeVar('KeyType') +ValueType = TypeVar('ValueType') -class ThreadSafeDict(dict): + +class ThreadSafeDict(Generic[KeyType, ValueType]): """A thread-safe dict.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, *args, **kwargs) -> None: + self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs) self._lock = threading.Lock() - def __getitem__(self, __key: Any) -> Any: + def __getitem__(self, key: KeyType) -> ValueType: with self._lock: - return super().__getitem__(__key) + return self._dict.__getitem__(key) - def __setitem__(self, __key: Any, __value: Any) -> None: + def __setitem__(self, key: KeyType, value: ValueType) -> None: with self._lock: - return super().__setitem__(__key, __value) + return self._dict.__setitem__(key, value) - def __delitem__(self, __key: Any) -> None: + def __delitem__(self, key: KeyType) -> None: with self._lock: - return super().__delitem__(__key) + return self._dict.__delitem__(key) def __len__(self) -> int: with self._lock: - return super().__len__() + return self._dict.__len__() - def __contains__(self, __key: object) -> bool: + def __contains__(self, key: KeyType) -> bool: with self._lock: - return super().__contains__(__key) + return self._dict.__contains__(key) def items(self): with self._lock: - return super().items() + return self._dict.items() def values(self): with self._lock: - return super().values() + return self._dict.values() def generate_controller_cluster_name(service_name: str) -> str: @@ -320,7 +324,7 @@ def _get_replica_status() -> status_lib.ReplicaStatus: # Notify user here to make sure user won't think the log is finished. 
print(f'{colorama.Fore.YELLOW}Start streaming logs for task job ' - f'of replica {replica_id}.{colorama.Style.RESET_ALL}') + f'of replica {replica_id}...{colorama.Style.RESET_ALL}') backend = backends.CloudVmRayBackend() # Always tail the logs of the first job, which represent user setup & run. From b288ba05e9beb69602bef1c36f1a7643a0341ffe Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 30 Aug 2023 17:38:07 -0700 Subject: [PATCH 035/223] Apply suggestions from code review Co-authored-by: Wei-Lin Chiang --- tests/skyserve/http/azure.yaml | 2 +- tests/skyserve/interrupt/service.yaml | 2 +- tests/test_smoke.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/skyserve/http/azure.yaml b/tests/skyserve/http/azure.yaml index 7bf5ac927ba..011e57deba5 100644 --- a/tests/skyserve/http/azure.yaml +++ b/tests/skyserve/http/azure.yaml @@ -4,7 +4,7 @@ resources: workdir: sky/serve/examples/http_server -# Use 8080 to test jupyterhub service is terminated +# Use 8081 to test jupyterhub service is terminated run: python3 server.py --port 8081 service: diff --git a/tests/skyserve/interrupt/service.yaml b/tests/skyserve/interrupt/service.yaml index e22b2e9994d..1f12e139ed1 100644 --- a/tests/skyserve/interrupt/service.yaml +++ b/tests/skyserve/interrupt/service.yaml @@ -5,7 +5,7 @@ resources: workdir: tests/skyserve/interrupt -setup: pip install fastapi[all] httpx uvicorn +setup: pip install fastapi[all] uvicorn run: python3 server.py --port 8080 diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 40fd729f118..0d99519ab58 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2770,7 +2770,7 @@ def test_skyserve_interrupt(): zone = 'us-central1-a' # Reference: test_spot_recovery_gcp - def terminate_cmd(replica_id: int) -> str: + def terminate_replica(replica_id: int) -> str: cluster_name = serve.generate_replica_cluster_name(name, replica_id) query_cmd = (f'gcloud compute instances list --filter=' 
f'"(labels.ray-cluster-name:{cluster_name})" ' From 7fccb8f58b35a0ba3382fa71fe70964b51a6b4cd Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 30 Aug 2023 17:48:27 -0700 Subject: [PATCH 036/223] Apply suggestions from code review --- sky/cli.py | 1 + sky/serve/load_balancer.py | 36 +++++++++---------- ...gorithms.py => load_balancing_policies.py} | 10 +++--- sky/serve/serve_utils.py | 18 +++++----- tests/skyserve/llm/get_response.py | 3 -- tests/skyserve/llm/prompt_output.json | 2 +- .../{interrupt => replica_failure}/server.py | 0 .../service.yaml | 0 .../test_round_robin.py | 0 tests/test_smoke.py | 16 ++++----- 10 files changed, 42 insertions(+), 44 deletions(-) rename sky/serve/{load_balancing_algorithms.py => load_balancing_policies.py} (90%) rename tests/skyserve/{interrupt => replica_failure}/server.py (100%) rename tests/skyserve/{interrupt => replica_failure}/service.yaml (100%) rename tests/skyserve/{interrupt => replica_failure}/test_round_robin.py (100%) diff --git a/sky/cli.py b/sky/cli.py index 2f8dc06e86b..fef3c096d94 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2135,6 +2135,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa f'\nTo cancel spot jobs, use: {bold}sky spot cancel [--all]{reset}') else: + assert cluster.startswith(serve_lib.CONTROLLER_PREFIX) error_str = ( 'Cancelling the sky serve controller\'s jobs is not allowed.') click.echo(error_str) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 9f7ebac5ee7..a73b8cb5eb6 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -9,42 +9,42 @@ from sky import sky_logging from sky.serve import constants -from sky.serve import load_balancing_algorithms +from sky.serve import load_balancing_policies # Use the explicit logger name so that the logger is under the # `sky.serve.load_balancer` namespace when executed directly, so as # to inherit the setup from the `sky` logger. 
logger = sky_logging.init_logger('sky.serve.load_balancer') -_LBAlgorithm = load_balancing_algorithms.LoadBalancingAlgorithm - class SkyServeLoadBalancer: """SkyServeLoadBalancer: redirect incoming traffic. This class accept any traffic to the controller and redirect it to the appropriate endpoint replica according to the load balancing - algorithm. + policy. """ - def __init__(self, controller_url: str, port: int, - load_balancing_algorithm: _LBAlgorithm) -> None: + def __init__( + self, controller_url: str, port: int, + load_balancing_policy: load_balancing_policies.LoadBalancingPolicy + ) -> None: self.app = fastapi.FastAPI() self.controller_url = controller_url self.port = port - self.lb_algorithm = load_balancing_algorithm + self.load_balancing_policy = load_balancing_policy for i in range(3): resp = requests.get(self.controller_url + '/controller/get_autoscaler_query_interval') if resp.status_code == 200: - self.lb_algorithm.set_query_interval( + self.load_balancing_policy.set_query_interval( resp.json()['query_interval']) break if i == 2: logger.error('Failed to get autoscaler query interval. ' 'Use default interval instead.') - self.lb_algorithm.set_query_interval(None) + self.load_balancing_policy.set_query_interval(None) time.sleep(10) def _sync_with_controller(self): @@ -55,8 +55,8 @@ def _sync_with_controller(self): response = session.post( self.controller_url + '/controller/update_num_requests', json={ - 'num_requests': - self.lb_algorithm.deprecate_old_requests() + 'num_requests': self.load_balancing_policy. 
+ deprecate_old_requests() }, timeout=5) response.raise_for_status() @@ -69,12 +69,13 @@ def _sync_with_controller(self): print(f'An error occurred: {e}') else: logger.info(f'Available Replica IPs: {ready_replicas}') - self.lb_algorithm.set_ready_replicas(ready_replicas) + self.load_balancing_policy.set_ready_replicas( + ready_replicas) time.sleep(constants.CONTROLLER_SYNC_INTERVAL) async def _redirect_handler(self, request: fastapi.Request): - self.lb_algorithm.increment_request_count(1) - replica_ip = self.lb_algorithm.select_replica(request) + self.load_balancing_policy.increment_request_count(1) + replica_ip = self.load_balancing_policy.select_replica(request) if replica_ip is None: raise fastapi.HTTPException(status_code=503, @@ -119,13 +120,12 @@ def run(self): required=True) args = parser.parse_args() - # ======= Load Balancing Algorithm ========= - _load_balancing_algorithm = ( - load_balancing_algorithms.RoundRobinLoadBalancingAlgorithm()) + # ======= Load Balancing Policy ========= + _load_balancing_policy = load_balancing_policies.RoundRobinPolicy() # ======= SkyServeLoadBalancer ========= load_balancer = SkyServeLoadBalancer( controller_url=args.controller_addr, port=args.port, - load_balancing_algorithm=_load_balancing_algorithm) + load_balancing_policy=_load_balancing_policy) load_balancer.run() diff --git a/sky/serve/load_balancing_algorithms.py b/sky/serve/load_balancing_policies.py similarity index 90% rename from sky/serve/load_balancing_algorithms.py rename to sky/serve/load_balancing_policies.py index b0c943b757e..ba55650697c 100644 --- a/sky/serve/load_balancing_algorithms.py +++ b/sky/serve/load_balancing_policies.py @@ -1,4 +1,4 @@ -"""LoadBalancingAlgorithms: algorithm to select endpoint.""" +"""LoadBalancingPolicy: Policy to select endpoint.""" from collections import deque import logging import time @@ -11,8 +11,8 @@ _DEFAULT_QUERY_INTERVAL = 60 -class LoadBalancingAlgorithm: - """Abstract class for load balancing algorithms.""" +class 
LoadBalancingPolicy: + """Abstract class for load balancing policies.""" def __init__(self) -> None: self.ready_replicas: Set[str] = set() @@ -49,8 +49,8 @@ def select_replica(self, request: fastapi.Request) -> Optional[str]: raise NotImplementedError -class RoundRobinLoadBalancingAlgorithm(LoadBalancingAlgorithm): - """Round-robin load balancing algorithm.""" +class RoundRobinPolicy(LoadBalancingPolicy): + """Round-robin load balancing policy.""" def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 3cc70b59b3e..08a807f65c6 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -201,7 +201,7 @@ def _follow_logs(file: TextIO, cluster_name: str, *, finish_stream: Callable[[], bool], - exit_when_no_new_content: bool = False, + exit_if_stream_end: bool = False, no_new_content_timeout: Optional[int] = None) -> Iterator[str]: line = '' log_file = None @@ -240,17 +240,17 @@ def cluster_is_up() -> bool: # We still exit if more than 10 seconds without new # content to avoid any internal bug that causes # the launch failed and cluster status remains INIT. 
- for l in _follow_logs(f, - cluster_name, - finish_stream=cluster_is_up, - exit_when_no_new_content= - exit_when_no_new_content, - no_new_content_timeout=10): + for l in _follow_logs( + f, + cluster_name, + finish_stream=cluster_is_up, + exit_if_stream_end=exit_if_stream_end, + no_new_content_timeout=10): yield l log_file = None line = '' else: - if exit_when_no_new_content or finish_stream(): + if exit_if_stream_end or finish_stream(): break if no_new_content_timeout is not None: if no_new_content_cnt >= no_new_content_timeout: @@ -315,7 +315,7 @@ def _get_replica_status() -> status_lib.ReplicaStatus: for line in _follow_logs(f, replica_cluster_name, finish_stream=finish_stream, - exit_when_no_new_content=not follow): + exit_if_stream_end=not follow): print(line, end='', flush=True) if not follow and _get_replica_status( ) == status_lib.ReplicaStatus.PROVISIONING: diff --git a/tests/skyserve/llm/get_response.py b/tests/skyserve/llm/get_response.py index fc96d0e4a9d..f0fa530effc 100644 --- a/tests/skyserve/llm/get_response.py +++ b/tests/skyserve/llm/get_response.py @@ -27,7 +27,4 @@ 'temperature': 0, }) - if 'choices' not in resp.json(): - with open('/home/txia/skypilot/@temp/temp.py', 'a') as f: - f.write(f'Failed: {resp.json()}\n') print(resp.json()['choices'][0]['message']['content']) diff --git a/tests/skyserve/llm/prompt_output.json b/tests/skyserve/llm/prompt_output.json index 778cd8a0d35..e3ff18d5c1c 100644 --- a/tests/skyserve/llm/prompt_output.json +++ b/tests/skyserve/llm/prompt_output.json @@ -2,4 +2,4 @@ "Introduce yourself!": "Hello! My name is Alex. I'm an AI language model here to assist you with any questions or tasks you may have. How can I help you today?", "What is Deep Learning? 
Explain in less than 30 words.": "Deep Learning is a type of machine learning that uses multiple layers of complex algorithms to learn from data.", "Write a twitter post about skypilot, a framework for running LLMs, AI, and batch jobs on any cloud.": "\"Skypilot is a framework for running machine learning models, AI, and batch jobs on any cloud. With Skypilot, you can easily deploy and manage your machine learning models, AI models, and batch jobs across multiple cloud providers. Say goodbye to the hassle of managing multiple cloud providers and hello to a single platform for all your machine learning and AI needs. Try Skypilot today!\"" -} \ No newline at end of file +} diff --git a/tests/skyserve/interrupt/server.py b/tests/skyserve/replica_failure/server.py similarity index 100% rename from tests/skyserve/interrupt/server.py rename to tests/skyserve/replica_failure/server.py diff --git a/tests/skyserve/interrupt/service.yaml b/tests/skyserve/replica_failure/service.yaml similarity index 100% rename from tests/skyserve/interrupt/service.yaml rename to tests/skyserve/replica_failure/service.yaml diff --git a/tests/skyserve/interrupt/test_round_robin.py b/tests/skyserve/replica_failure/test_round_robin.py similarity index 100% rename from tests/skyserve/interrupt/test_round_robin.py rename to tests/skyserve/replica_failure/test_round_robin.py diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 0d99519ab58..6eedafb726d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2764,7 +2764,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: @pytest.mark.gcp -def test_skyserve_interrupt(): +def test_skyserve_replica_failure(): """Test skyserve with manually interrupting some replica""" name = _get_service_name() zone = 'us-central1-a' @@ -2779,28 +2779,28 @@ def terminate_replica(replica_id: int) -> str: f' --quiet $({query_cmd})') test = Test( - f'test-skyserve-interrupt', + f'test-skyserve-replica-failure', [ - f'sky serve up 
-n {name} -y tests/skyserve/interrupt/service.yaml', + f'sky serve up -n {name} -y tests/skyserve/replica_failure/service.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 1)}; ' f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/interrupt/test_round_robin.py ' + 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', - terminate_cmd(1), + terminate_replica(1), f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', f'sky serve status {name} | grep 2/3', f'{_get_replica_line(name, 1)} | grep NOT_READY', f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 2)}; ' f'{_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/interrupt/test_round_robin.py ' + 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 2 --replica-ips $ip2 $ip3', - terminate_cmd(2), + terminate_replica(2), f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', f'sky serve status {name} | grep 1/3', f'{_get_replica_line(name, 2)} | grep NOT_READY', f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 3)}; ' - 'python tests/skyserve/interrupt/test_round_robin.py ' + 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 1 --replica-ips $ip3', ], f'sky serve down -y {name}', From 58a11226de0443536387a7c44dac0eeea69fc3c9 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Mon, 4 Sep 2023 16:09:36 -0700 Subject: [PATCH 037/223] fix example --- sky/serve/examples/vicuna-v1.5.yaml | 4 ++-- sky/serve/examples/vllm.yaml | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sky/serve/examples/vicuna-v1.5.yaml b/sky/serve/examples/vicuna-v1.5.yaml index a46f75b49ca..d36b571e6b4 100644 --- a/sky/serve/examples/vicuna-v1.5.yaml +++ b/sky/serve/examples/vicuna-v1.5.yaml @@ -19,8 +19,8 @@ setup: | fi # Install dependencies - pip install 
git+https://github.com/lm-sys/FastChat.git - pip install transformers torch accelerate sentencepiece + pip install "fschat[model_worker,webui]==0.2.24" + pip install protobuf run: | conda activate chatbot diff --git a/sky/serve/examples/vllm.yaml b/sky/serve/examples/vllm.yaml index 7fa784f075f..e74b546b0f0 100644 --- a/sky/serve/examples/vllm.yaml +++ b/sky/serve/examples/vllm.yaml @@ -17,9 +17,8 @@ setup: | # Setup the environment conda create -n chatbot python=3.10 -y conda activate chatbot - pip install git+https://github.com/lm-sys/FastChat.git - pip install vllm - pip install accelerate + pip install "fschat[model_worker,webui]==0.2.24" + pip install vllm accelerate protobuf fi run: | From 7aef306ca4d141280f8fb2fed972c6bf991e9397 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Mon, 4 Sep 2023 22:36:14 -0700 Subject: [PATCH 038/223] Fix serve probe (#2513) reset --- sky/serve/infra_providers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 0c706ac1ea5..3ad341aae66 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -611,6 +611,7 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: info = self.replica_info[cluster_name] info.status_property.service_ready_now = res if res: + info.consecutive_failure_cnt = 0 if not info.status_property.service_once_ready: info.status_property.service_once_ready = True continue From d05b42ff924c1a17098bb75a880df43193b899c5 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 5 Sep 2023 21:12:34 -0700 Subject: [PATCH 039/223] [SkyServe] Add option to auto restart (#2518) * add auto restart * add smoke test * add task yaml for smoke test * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * apply suggestions from code review --------- Co-authored-by: Wei-Lin Chiang --- sky/serve/autoscalers.py | 5 +++- sky/serve/controller.py | 1 + sky/serve/infra_providers.py | 14 +++++++---- sky/serve/service_spec.py | 11 
+++++++++ sky/utils/schemas.py | 3 +++ tests/skyserve/auto_restart.yaml | 19 +++++++++++++++ tests/test_smoke.py | 40 ++++++++++++++++++++++++++++++++ 7 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 tests/skyserve/auto_restart.yaml diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index bc712aaa696..433e0a75a2d 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -20,10 +20,12 @@ class Autoscaler: def __init__(self, infra_provider: infra_providers.InfraProvider, + auto_restart: bool, frequency: int, min_nodes: int = 1, max_nodes: Optional[int] = None) -> None: self.infra_provider = infra_provider + self.auto_restart = auto_restart self.min_nodes: int = min_nodes # Default to fixed node, i.e. min_nodes == max_nodes. self.max_nodes: int = max_nodes or min_nodes @@ -99,7 +101,8 @@ def get_query_interval(self) -> int: def evaluate_scaling(self) -> None: current_time = time.time() - num_nodes = self.infra_provider.total_replica_num() + num_nodes = self.infra_provider.total_replica_num( + count_failed_replica=not self.auto_restart) # Check if cooldown period has passed since the last scaling operation. # Only cooldown if bootstrapping is done. 
diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 0599c107bee..637c6cb3231 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -146,6 +146,7 @@ def terminate(request: fastapi.Request): # ======= Autoscaler ========= _autoscaler = autoscalers.RequestRateAutoscaler( _infra_provider, + auto_restart=service_spec.auto_restart, frequency=20, min_nodes=service_spec.min_replicas, max_nodes=service_spec.max_replicas, diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 3ad341aae66..c7419c9d524 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -215,9 +215,8 @@ def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: def get_uptime(self) -> Optional[float]: return self.uptime - def total_replica_num(self) -> int: - # Returns the total number of replicas, including those under - # provisioning and deletion + def total_replica_num(self, count_failed_replica: bool) -> int: + # Returns the total number of replicas raise NotImplementedError def get_ready_replicas(self) -> Set[str]: @@ -389,8 +388,13 @@ def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: for info in self.replica_info.values() ] - def total_replica_num(self) -> int: - return len(self.replica_info) + def total_replica_num(self, count_failed_replica: bool) -> int: + if count_failed_replica: + return len(self.replica_info) + return len([ + i for i in self.replica_info.values() + if i.status != status_lib.ReplicaStatus.FAILED + ]) def get_ready_replicas(self) -> Set[str]: ready_replicas = set() diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 0019aebc8d1..6fccaa4bff2 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -26,6 +26,7 @@ def __init__( qps_lower_threshold: Optional[float] = None, post_data: Optional[Dict[str, Any]] = None, controller_resources: Optional[Dict[str, Any]] = None, + auto_restart: bool = False, ): if min_replicas < 0: with 
ux_utils.print_exception_no_traceback(): @@ -60,6 +61,7 @@ def __init__( self._qps_lower_threshold = qps_lower_threshold self._post_data = post_data self._controller_resources = controller_resources + self._auto_restart = auto_restart @staticmethod def from_yaml_config(config: Optional[Dict[str, Any]]): @@ -113,6 +115,7 @@ def from_yaml_config(config: Optional[Dict[str, Any]]): service_config['max_replicas'] = None service_config['qps_upper_threshold'] = None service_config['qps_lower_threshold'] = None + service_config['auto_restart'] = False else: service_config['min_replicas'] = policy_section['min_replicas'] service_config['max_replicas'] = policy_section.get( @@ -121,6 +124,8 @@ def from_yaml_config(config: Optional[Dict[str, Any]]): 'qps_upper_threshold', None) service_config['qps_lower_threshold'] = policy_section.get( 'qps_lower_threshold', None) + service_config['auto_restart'] = policy_section.get( + 'auto_restart', False) service_config['controller_resources'] = config.pop( 'controller_resources', None) @@ -174,6 +179,7 @@ def add_if_not_none(section, key, value, no_empty: bool = False): self.qps_lower_threshold) add_if_not_none('controller_resources', None, self._controller_resources) + add_if_not_none('auto_restart', None, self._auto_restart) return config @@ -196,6 +202,7 @@ def __repr__(self) -> str: Readiness probe method: {self.probe_str()} Replica autoscaling policy: {self.policy_str()} Service initial delay seconds: {self.initial_delay_seconds} + Replica auto restart: {self.auto_restart} Please refer to SkyPilot Serve document for detailed explanations. 
""") @@ -239,3 +246,7 @@ def post_data(self) -> Optional[Dict[str, Any]]: @property def controller_resources(self) -> Optional[Dict[str, Any]]: return self._controller_resources + + @property + def auto_restart(self) -> bool: + return self._auto_restart diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 3c664615efb..3dd4703a184 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -199,6 +199,9 @@ def get_service_schema(): 'qps_lower_threshold': { 'type': 'number', }, + 'auto_restart': { + 'type': 'boolean', + }, } }, 'replicas': { diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml new file mode 100644 index 00000000000..ba5affe4d2a --- /dev/null +++ b/tests/skyserve/auto_restart.yaml @@ -0,0 +1,19 @@ +resources: + cloud: gcp + zone: us-central1-a + cpus: 2+ + +workdir: sky/serve/examples/http_server + +run: python3 server.py --port 8080 + +service: + port: 8080 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replica_policy: + min_replicas: 1 + auto_restart: true + controller_resources: + cloud: gcp diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6eedafb726d..3f73f2c5cec 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2809,6 +2809,46 @@ def terminate_replica(replica_id: int) -> str: run_one_test(test) +@pytest.mark.gcp +def test_skyserve_auto_restart(): + """Test skyserve with auto restart""" + name = _get_service_name() + zone = 'us-central1-a' + + # Reference: test_spot_recovery_gcp + def terminate_replica(replica_id: int) -> str: + cluster_name = serve.generate_replica_cluster_name(name, replica_id) + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{cluster_name})" ' + f'--zones={zone} --format="value(name)"') + return (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + + test = Test( + f'test-skyserve-auto-restart', + [ + # TODO(tian): we can dynamically generate YAML from template to + # avoid 
maintaining too many YAML files + f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', + terminate_replica(1), + 'sleep 180', # Wait for consecutive failure timeout passed. + # Currently failed replica will still count as replica num in sky serve status. + # TODO(tian): Fix this in the future. + '(while true; do' + f' output=$(sky serve status {name});' + ' echo "$output" | grep -q "1/2" && break;' + ' sleep 10;' + f'done); sleep {serve.CONTROLLER_SYNC_INTERVAL};', + f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + # ------- Testing user ray cluster -------- @pytest.mark.no_kubernetes # Kubernetes does not support sky status -r yet. def test_user_ray_cluster(generic_cloud: str): From 5200fbdccfe32718c52f37fac7551b8b212839e5 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 5 Sep 2023 21:57:43 -0700 Subject: [PATCH 040/223] [SkyServe] Fix auto restart (#2521) * add auto restart * add smoke test * add task yaml for smoke test * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * apply suggestions from code review * fix * fix --------- Co-authored-by: Wei-Lin Chiang --- sky/serve/service_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 6fccaa4bff2..6e2867e6562 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -177,9 +177,9 @@ def add_if_not_none(section, key, value, no_empty: bool = False): self.qps_upper_threshold) add_if_not_none('replica_policy', 'qps_lower_threshold', self.qps_lower_threshold) + add_if_not_none('replica_policy', 'auto_restart', self._auto_restart) add_if_not_none('controller_resources', None, self._controller_resources) - 
add_if_not_none('auto_restart', None, self._auto_restart) return config From 45e43c3b86ea292cae7ba8630fba6d2eba4aa95d Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sat, 9 Sep 2023 15:52:06 -0700 Subject: [PATCH 041/223] [SkyServe] Using Multi-service controller (#2489) * finish * remove finished TODO * remove task debug * fix cli flags * fix cli * format * filter our not UP controller * add --new-controller * add autodown * not expose controller * fix * early exit when job is pending * fix typo * add pending cnt * move get ports to serve_utils * early store service handle * move controller_port to infraprovider * add constants * format * format * format * add autodown hint for skyservecontroller * add todo * fix bug * extent timeout * broad except * group by services * fix bug * endpoint -> endpoint ip; disable user controller port * remove local replica infoo; move controller cluster name to DB; add service nam * auto switch to new controller * fix & add TODO * nit * Apply suggestions from code review Co-authored-by: Wei-Lin Chiang * Apply suggestions from code review * prototype. todo: debug * fix some bugs * finish most of them. 
TODO: merge all jobs into one * fix * merge all jobs into serve-controller.yaml * add service name precheck * minor * make python API more robust * apply suggestion from code review * aggregate env vars for controller * rename record * upd comments * Update sky/execution.py Co-authored-by: Wei-Lin Chiang * change to set, format * extract _get_service_num_on_controller into another function * use autostop * set task default vCPU to 0.125 * minor * cleanup utility files after sky serve down * fix spot controller * Update sky/execution.py Co-authored-by: Wei-Lin Chiang * apply suggestions from code review * Update sky/cli.py Co-authored-by: Wei-Lin Chiang * Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Wei-Lin Chiang * apply suggestion from code review --------- Co-authored-by: Wei-Lin Chiang --- sky/backends/backend.py | 4 +- sky/backends/backend_utils.py | 78 ++-- sky/backends/cloud_vm_ray_backend.py | 71 ++-- sky/backends/local_docker_backend.py | 13 +- sky/cli.py | 110 +++--- sky/core.py | 50 ++- sky/execution.py | 383 +++++++++++--------- sky/global_user_state.py | 24 +- sky/serve/__init__.py | 19 +- sky/serve/constants.py | 39 +- sky/serve/controller.py | 7 +- sky/serve/infra_providers.py | 29 +- sky/serve/load_balancer.py | 49 ++- sky/serve/serve_utils.py | 403 ++++++++++++++++----- sky/serve/service_spec.py | 6 - sky/task.py | 3 + sky/templates/sky-serve-controller.yaml.j2 | 21 +- sky/templates/spot-controller.yaml.j2 | 20 +- sky/utils/cli_utils/status_utils.py | 21 +- 19 files changed, 909 insertions(+), 441 deletions(-) diff --git a/sky/backends/backend.py b/sky/backends/backend.py index 28aa981b078..c5e76db026f 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -86,7 +86,7 @@ def execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> None: + dryrun: bool = False) -> Optional[int]: usage_lib.record_cluster_name_for_current_operation( handle.get_cluster_name()) 
usage_lib.messages.usage.update_actual_task(task) @@ -143,7 +143,7 @@ def _execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> None: + dryrun: bool = False) -> Optional[int]: raise NotImplementedError def _post_execute(self, handle: _ResourceHandleType, down: bool) -> None: diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 78d0c81e969..b320b227d55 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,6 @@ """Util constants/functions for the backends.""" import collections +import copy from datetime import datetime import difflib import enum @@ -2636,35 +2637,45 @@ def _refresh_service_record_no_lock( A tuple of a possibly updated record and an error message if any error occurred when refreshing the service. """ - record = global_user_state.get_service_from_name(service_name) - if record is None: + local_record = global_user_state.get_service_from_name(service_name) + if local_record is None: return None, None - service_handle: serve_lib.ServiceHandle = record['handle'] + + # We use a copy of the record with default value of replica_info to return + # when there is an error. + record = copy.deepcopy(local_record) + record['replica_info'] = [] + service_handle: serve_lib.ServiceHandle = local_record['handle'] try: check_network_connection() except exceptions.NetworkError: return record, 'Failed to refresh replica info due to network error.' - if not service_handle.endpoint: + if not service_handle.endpoint_ip: # Service controller is still initializing. Skipped refresh status. 
return record, None - controller_cluster_name = service_handle.controller_cluster_name - cluster_record = global_user_state.get_cluster_from_name( - controller_cluster_name) - if (cluster_record is None or - cluster_record['status'] != status_lib.ClusterStatus.UP): + controller_name = local_record['controller_name'] + cluster_record = global_user_state.get_cluster_from_name(controller_name) + + # We don't check controller status here since it might be in INIT status + # when other services is starting up and launching the controller. + if cluster_record is None: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, (f'Controller cluster {controller_cluster_name!r} ' - 'is not found or UP.') + return record, (f'Controller cluster {controller_name!r} ' + 'is not found.') handle = cluster_record['handle'] backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = serve_lib.ServeCodeGen.get_latest_info() + if service_handle.controller_port is None: + return record, 'Controller task is not successfully launched.' + + code = serve_lib.ServeCodeGen.get_latest_info( + service_handle.controller_port) returncode, latest_info_payload, stderr = backend.run_on_head( handle, code, @@ -2676,19 +2687,25 @@ def _refresh_service_record_no_lock( f'Using the cached record. Reason: {stderr}') latest_info = serve_lib.load_latest_info(latest_info_payload) - service_handle.replica_info = latest_info['replica_info'] service_handle.uptime = latest_info['uptime'] # When the service is shutting down, there is a period of time which the # controller still responds to the request, and the replica is not # terminated, so the return value for _service_status_from_replica_info # will still be READY, but we don't want change service status to READY. 
- if record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: - record['status'] = _service_status_from_replica_info( + # For controller init, there is a small chance that the controller is + # running but the load balancer is not. In this case, the service status + # shouldn't be refreshed too. + if local_record['status'] not in [ + status_lib.ServiceStatus.SHUTTING_DOWN, + status_lib.ServiceStatus.CONTROLLER_INIT, + ]: + local_record['status'] = _service_status_from_replica_info( latest_info['replica_info']) - global_user_state.add_or_update_service(**record) - return record, None + global_user_state.add_or_update_service(**local_record) + local_record['replica_info'] = latest_info['replica_info'] + return local_record, None def _refresh_service_record( @@ -2706,6 +2723,8 @@ def _refresh_service_record( return global_user_state.get_service_from_name(service_name), msg +# TODO(tian): Maybe aggregate services using same controller to reduce SSH +# overhead? def refresh_service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: if service_name is None: service_names = [ @@ -2778,10 +2797,16 @@ def get_backend_from_handle( return backend -def get_task_demands_dict(task: 'task_lib.Task') -> Optional[Dict[str, float]]: - """Returns the accelerator dict of the task""" - # TODO: CPU and other memory resources are not supported yet. - accelerator_dict = None +def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: + """Returns the resources dict of the task""" + # TODO: CPU and other memory resources are not supported yet + # except for sky serve controller task. + resources_dict = { + # We set CPU resource for sky serve controller to a smaller value + # to support a larger number of services. 
+ 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND if + task.is_sky_serve_controller_task else DEFAULT_TASK_CPU_DEMAND) + } if task.best_resources is not None: resources = task.best_resources else: @@ -2789,17 +2814,14 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Optional[Dict[str, float]]: # sky.optimize(), so best_resources may be None. assert len(task.resources) == 1, task.resources resources = list(task.resources)[0] - if resources is not None: - accelerator_dict = resources.accelerators - return accelerator_dict + if resources is not None and resources.accelerators is not None: + resources_dict.update(resources.accelerators) + return resources_dict def get_task_resources_str(task: 'task_lib.Task') -> str: resources_dict = get_task_demands_dict(task) - if resources_dict is None: - resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}' - else: - resources_str = ', '.join(f'{k}:{v}' for k, v in resources_dict.items()) + resources_str = ', '.join(f'{k}:{v}' for k, v in resources_dict.items()) resources_str = f'{task.num_nodes}x [{resources_str}]' return resources_str diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b16ace2ab5a..59cff03ffd3 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -287,7 +287,7 @@ def add_prologue(self, job_id: int, is_local: bool = False) -> None: def add_gang_scheduling_placement_group_and_setup( self, num_nodes: int, - accelerator_dict: Optional[Dict[str, float]], + resources_dict: Dict[str, float], stable_cluster_internal_ips: List[str], setup_cmd: Optional[str] = None, setup_log_path: Optional[str] = None, @@ -305,16 +305,18 @@ def add_gang_scheduling_placement_group_and_setup( self._has_gang_scheduling = True self._num_nodes = num_nodes + task_cpu_demand = resources_dict.pop('CPU') # Set CPU to avoid ray hanging the resources allocation # for remote functions, since the task will request 1 CPU # by default. 
- bundles = [{ - 'CPU': backend_utils.DEFAULT_TASK_CPU_DEMAND - } for _ in range(num_nodes)] + bundles = [{'CPU': task_cpu_demand} for _ in range(num_nodes)] - if accelerator_dict is not None: - acc_name = list(accelerator_dict.keys())[0] - acc_count = list(accelerator_dict.values())[0] + if len(resources_dict) > 0: + assert len(resources_dict) == 1, \ + ('There can only be one type of accelerator per instance.' + f' Found: {resources_dict}.') + acc_name = list(resources_dict.keys())[0] + acc_count = list(resources_dict.values())[0] gpu_dict = {'GPU': acc_count} # gpu_dict should be empty when the accelerator is not GPU. # FIXME: This is a hack to make sure that we do not reserve @@ -323,7 +325,7 @@ def add_gang_scheduling_placement_group_and_setup( gpu_dict = {} for bundle in bundles: bundle.update({ - **accelerator_dict, + **resources_dict, # Set the GPU to avoid ray hanging the resources allocation **gpu_dict, }) @@ -416,7 +418,7 @@ def check_ip(): return ray.util.get_node_ip_address() gang_scheduling_id_to_ip = ray.get([ check_ip.options( - num_cpus={backend_utils.DEFAULT_TASK_CPU_DEMAND}, + num_cpus={task_cpu_demand}, scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy( placement_group=pg, placement_group_bundle_index=i @@ -454,7 +456,7 @@ def add_ray_task(self, bash_script: Optional[str], task_name: Optional[str], job_run_id: Optional[str], - ray_resources_dict: Optional[Dict[str, float]], + ray_resources_dict: Dict[str, float], log_dir: str, env_vars: Optional[Dict[str, str]] = None, gang_scheduling_id: int = 0, @@ -466,14 +468,15 @@ def add_ray_task(self, assert (not self._has_register_run_fn or bash_script is None), ('bash_script should ' 'be None when run_fn is registered.') + task_cpu_demand = ray_resources_dict.pop('CPU') # Build remote_task.options(...) # resources=... # num_gpus=... 
options = [] - options.append(f'num_cpus={backend_utils.DEFAULT_TASK_CPU_DEMAND}') + options.append(f'num_cpus={task_cpu_demand}') num_gpus = 0.0 - if ray_resources_dict is not None: + if len(ray_resources_dict) > 0: assert len(ray_resources_dict) == 1, \ ('There can only be one type of accelerator per instance.' f' Found: {ray_resources_dict}.') @@ -3334,10 +3337,15 @@ def _execute( task: task_lib.Task, detach_run: bool, dryrun: bool = False, - ) -> None: + ) -> Optional[int]: + """Execute a job on the cluster. + + Returns: + The job id if the job is submitted successfully, None otherwise. + """ if task.run is None: logger.info('Run commands not specified or empty.') - return + return None # Check the task resources vs the cluster resources. Since `sky exec` # will not run the provision and _check_existing_cluster self.check_resources_fit_cluster(handle, task) @@ -3347,7 +3355,7 @@ def _execute( if dryrun: logger.info(f'Dryrun complete. Would have run:\n{task}') - return + return None job_id = self._add_job(handle, task.name, resources_str) @@ -3358,6 +3366,7 @@ def _execute( else: # Case: task_lib.Task(run, num_nodes=1) self._execute_task_one_node(handle, task, job_id, detach_run) + return job_id def _post_execute(self, handle: CloudVmRayResourceHandle, down: bool) -> None: @@ -3676,10 +3685,24 @@ def tail_spot_logs(self, ) def tail_serve_logs(self, handle: CloudVmRayResourceHandle, - service_name: str, replica_id: int, - follow: bool) -> None: - code = serve_lib.ServeCodeGen.stream_logs(service_name, replica_id, - follow) + service_handle: serve_lib.ServiceHandle, + controller: bool, load_balancer: bool, + replica_id: Optional[int], follow: bool) -> None: + if controller or load_balancer: + code = serve_lib.ServeCodeGen.stream_serve_process_logs( + service_handle.service_name, + stream_controller=controller, + follow=follow) + else: + if service_handle.controller_port is None: + logger.warning('Controller task is not successfully launched ' + f'for service 
{service_handle.service_name!r}. ' + 'Cannot stream logs.') + return + assert replica_id is not None, service_handle + code = serve_lib.ServeCodeGen.stream_replica_logs( + service_handle.service_name, service_handle.controller_port, + replica_id, follow) signal.signal(signal.SIGINT, backend_utils.interrupt_handler) signal.signal(signal.SIGTSTP, backend_utils.stop_handler) @@ -4506,7 +4529,7 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, # Launch the command as a Ray task. log_dir = os.path.join(self.log_dir, 'tasks') - accelerator_dict = backend_utils.get_task_demands_dict(task) + resources_dict = backend_utils.get_task_demands_dict(task) internal_ips = handle.internal_ips() assert internal_ips is not None, 'internal_ips is not cached in handle' @@ -4515,7 +4538,7 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, codegen.add_prologue(job_id, is_local=is_local) codegen.add_gang_scheduling_placement_group_and_setup( 1, - accelerator_dict, + resources_dict, stable_cluster_internal_ips=internal_ips, setup_cmd=self._setup_cmd, setup_log_path=os.path.join(log_dir, 'setup.log'), @@ -4564,7 +4587,7 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, # submit _run_cmd(cmd) with resource {node_i: 1} log_dir_base = self.log_dir log_dir = os.path.join(log_dir_base, 'tasks') - accelerator_dict = backend_utils.get_task_demands_dict(task) + resources_dict = backend_utils.get_task_demands_dict(task) internal_ips = handle.internal_ips() assert internal_ips is not None, 'internal_ips is not cached in handle' @@ -4582,7 +4605,7 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, codegen.add_prologue(job_id, is_local=is_local) codegen.add_gang_scheduling_placement_group_and_setup( num_actual_nodes, - accelerator_dict, + resources_dict, stable_cluster_internal_ips=internal_ips, setup_cmd=self._setup_cmd, setup_log_path=os.path.join(log_dir, 'setup.log'), @@ -4614,7 +4637,7 @@ def _execute_task_n_nodes(self, 
handle: CloudVmRayResourceHandle, env_vars=task.envs, task_name=task.name, job_run_id=job_run_id, - ray_resources_dict=accelerator_dict, + ray_resources_dict=resources_dict, log_dir=log_dir, gang_scheduling_id=i, use_sudo=use_sudo, diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index 3f95f1acf22..df402500164 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -269,9 +269,13 @@ def _execute(self, handle: LocalDockerResourceHandle, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> None: - """ Launches the container.""" + dryrun: bool = False) -> Optional[int]: + """ Launches the container. + Returns: + The job id if the job is submitted successfully. LocalDockerBackend + does not have the concept of job id, so this is always None. + """ if detach_run: raise NotImplementedError('detach_run=True is not supported in ' 'LocalDockerBackend.') @@ -284,13 +288,14 @@ def _execute(self, # Handle a basic task if task.run is None: logger.info(f'Nothing to run; run command not specified:\n{task}') - return + return None if dryrun: logger.info(f'Dryrun complete. 
Would have run:\n{task}') - return + return None self._execute_task_one_node(handle, task) + return None def _post_execute(self, handle: LocalDockerResourceHandle, down: bool) -> None: diff --git a/sky/cli.py b/sky/cli.py index fef3c096d94..39109bb795b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1770,7 +1770,8 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): if skyserve_controllers: click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}\n' f'SkyServe Controllers{colorama.Style.RESET_ALL}') - status_utils.show_status_table(skyserve_controllers, all) + num_pending_autostop += status_utils.show_status_table( + skyserve_controllers, all) hints.append( f'* To see detailed service status: {colorama.Style.BRIGHT}' f'sky serve status{colorama.Style.RESET_ALL}') @@ -2672,6 +2673,24 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): 'to terminate (see caveats above).') +def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): + services = global_user_state.get_services_from_controller_name( + controller_name) + if services: + service_names = [service['name'] for service in services] + with ux_utils.print_exception_no_traceback(): + plural = '' if len(service_names) == 1 else 's' + raise exceptions.NotSupportedError( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + f'is not supported, as it is currently serving the following ' + f'service{plural}: {", ".join(service_names)}. 
Please teardown ' + f'the service{plural} first with {colorama.Style.BRIGHT}sky ' + f'serve down {" ".join(service_names)}' + f'{colorama.Style.RESET_ALL}.') + msg = (f'Tearing down sky serve controller: {controller_name}.') + click.echo(msg) + + def _down_or_stop_clusters( names: List[str], apply_to_all: Optional[bool], @@ -2751,10 +2770,29 @@ def _down_or_stop_clusters( f'{names_str} is currently not supported.\n' 'Please omit the cluster(s) with reserved prefix ' f'{name_to_reserved_prefix.values()}.') - raise click.UsageError( - f'{operation} cluster(s) with reserved prefix ' - f'{reserve_prefix_str} is not supported. To teardown a ' - 'service, please use `sky serve down`.') + if not down: + raise click.UsageError( + f'{operation} cluster(s) with reserved prefix ' + f'{reserve_prefix_str} is not supported. To teardown a ' + 'service, please use `sky serve down`.') + else: + if len(name_to_reserved_prefix) > 1: + raise click.UsageError( + f'{operation} multiple clusters with reserved prefix ' + f'{reserve_prefix_str} is currently not supported.\n' + 'Please omit all but one of the clusters.') + # We can only teardown one reserved cluster (sky serve + # controller) for now. + _hint_or_raise_for_down_sky_serve_controller( + list(name_to_reserved_prefix.keys())[0]) + confirm_str = 'delete' + user_input = click.prompt( + f'To proceed, please type {colorama.Style.BRIGHT}' + f'{confirm_str!r}{colorama.Style.RESET_ALL}', + type=str) + if user_input != confirm_str: + raise click.Abort() + no_confirm = True # Make sure the reserved clusters are explicitly specified without other # normal clusters. 
if len(reserved_clusters) > 0: @@ -4075,34 +4113,9 @@ def serve_up( app_port = int(task.service.app_port) task.set_resources(requested_resources.copy(ports=[app_port])) - controller_resources_config: Dict[str, Any] = copy.copy( - serve_lib.CONTROLLER_RESOURCES) - if task.service.controller_resources is not None: - controller_resources_config.update(task.service.controller_resources) - # TODO(tian): We might need a thorough design on this. - if 'ports' not in controller_resources_config: - controller_resources_config['ports'] = [] - controller_resources_config['ports'].append(app_port) - try: - controller_resources = sky.Resources.from_yaml_config( - controller_resources_config) - except ValueError as e: - raise ValueError( - 'Encountered error when parsing controller resources') from e - click.secho('Service Spec:', fg='cyan') click.echo(task.service) - dummy_controller_task = sky.Task().set_resources(controller_resources) - click.secho('The controller will use the following resource:', fg='cyan') - with sky.Dag() as dag: - dag.add(dummy_controller_task) - sky.optimize(dag) - click.echo() - - dummy_controller_task: sky.Task = dag.tasks[0] - controller_best_resources = dummy_controller_task.best_resources - click.secho('Each replica will use the following resource:', fg='cyan') with sky.Dag() as dag: dag.add(task) @@ -4114,8 +4127,7 @@ def serve_up( if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_up(task, service_name, controller_resources, - controller_best_resources) + sky.serve_up(task, service_name) @serve.command('status', cls=_DocumentedCodeCommand) @@ -4217,8 +4229,7 @@ def serve_status(all: bool, service_name: Optional[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: - service_handle: serve_lib.ServiceHandle = service_record['handle'] - for replica_record in service_handle.replica_info: + for replica_record in service_record['replica_info']: 
replica_record['service_name'] = service_record['name'] replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) @@ -4314,17 +4325,18 @@ def _down_service(name: str): try: sky.serve_down(name, purge) except RuntimeError as e: - message = (f'{colorama.Fore.RED}Teardown service {name}...failed. ' - 'Please manually clean up the replicas and then use ' - '--purge to clean up the controller.' - f'{colorama.Style.RESET_ALL}' - f'\nReason: {common_utils.format_exception(e)}.') + message = ( + f'{colorama.Fore.RED}Tearing down service {name}...failed. ' + 'Please manually clean up the replicas and then use --purge ' + f'to clean up the controller.{colorama.Style.RESET_ALL}' + f'\nReason: {common_utils.format_exception(e)}.') except (exceptions.NotSupportedError, exceptions.ClusterOwnerIdentityMismatchError) as e: message = str(e) else: - message = (f'{colorama.Fore.GREEN}Teardown service {name}...done.' - f'{colorama.Style.RESET_ALL}') + message = ( + f'{colorama.Fore.GREEN}Tearing down service {name}...done.' 
+ f'{colorama.Style.RESET_ALL}') success_progress = True progress.stop() @@ -4390,17 +4402,11 @@ def serve_logs( raise click.UsageError( 'One and only one of --controller, --load-balancer, ' '[REPLICA_ID] can be specified.') - service_record = global_user_state.get_service_from_name(service_name) - if service_record is None: - click.secho(f'Service {service_name!r} not found.', fg='red') - return - controller_cluster_name = service_record['handle'].controller_cluster_name - if controller: - core.tail_logs(controller_cluster_name, job_id=1, follow=follow) - elif load_balancer: - core.tail_logs(controller_cluster_name, job_id=2, follow=follow) - else: - core.serve_tail_logs(service_record, replica_id, follow=follow) + core.serve_tail_logs(service_name, + controller, + load_balancer, + replica_id, + follow=follow) # ============================== diff --git a/sky/core.py b/sky/core.py index d45988e8ca0..d758ee22b45 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1,6 +1,7 @@ """SDK functions for cluster/job management.""" import getpass import sys +import typing from typing import Any, Dict, List, Optional, Union import colorama @@ -24,6 +25,9 @@ from sky.utils import tpu_utils from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from sky import serve + logger = sky_logging.init_logger(__name__) # ====================== @@ -115,9 +119,36 @@ def service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: @usage_lib.entrypoint -def serve_tail_logs(service_record: Dict[str, Any], replica_id: int, - follow: bool) -> None: - service_name = service_record['name'] +def serve_tail_logs(service_name: str, + controller: bool = False, + load_balancer: bool = False, + replica_id: Optional[int] = None, + follow: bool = True) -> None: + """Tail logs for a service. + + Usage: + core.serve_tail_logs(service_name, <target>=<value>, follow=True/False) + + One and only one of <target> must be specified: controller, load_balancer, + or replica_id.
+ + To tail controller logs: + # follow default to True + core.serve_tail_logs(service_name, controller=True) + + To print replica 3 logs: + core.serve_tail_logs(service_name, replica_id=3, follow=False) + """ + have_replica_id = replica_id is not None + if (controller + load_balancer + have_replica_id) != 1: + with ux_utils.print_exception_no_traceback(): + raise ValueError('One and only one of controller, load_balancer, ' + 'or replica_id must be specified.') + service_record = global_user_state.get_service_from_name(service_name) + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r} does not exist. ' + 'Cannot stream logs.') if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: with ux_utils.print_exception_no_traceback(): raise ValueError( @@ -127,15 +158,20 @@ def serve_tail_logs(service_record: Dict[str, Any], replica_id: int, with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r}\'s controller failed. 
' 'Cannot tail logs.') - controller_cluster_name = service_record['handle'].controller_cluster_name - handle = global_user_state.get_handle_from_cluster_name( - controller_cluster_name) + service_handle: 'serve.ServiceHandle' = service_record['handle'] + controller_name = service_record['controller_name'] + handle = global_user_state.get_handle_from_cluster_name(controller_name) if handle is None: raise ValueError(f'Cannot find controller for service {service_name}.') assert isinstance(handle, backends.CloudVmRayResourceHandle), handle backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend - backend.tail_serve_logs(handle, service_name, replica_id, follow=follow) + backend.tail_serve_logs(handle, + service_handle, + controller, + load_balancer, + replica_id, + follow=follow) @usage_lib.entrypoint diff --git a/sky/execution.py b/sky/execution.py index f95dfd9a50c..e680c48198a 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -16,19 +16,18 @@ import enum import getpass import os +import re import tempfile import time from typing import Any, Dict, List, Optional, Union import uuid import colorama -import requests -from rich import console as rich_console +import filelock import sky from sky import backends from sky import clouds -from sky import core from sky import exceptions from sky import global_user_state from sky import optimizer @@ -184,7 +183,7 @@ def _execute( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, -) -> None: +) -> Optional[int]: """Execute an entrypoint. If sky.Task is given or DAG has not been optimized yet, this will call @@ -222,6 +221,10 @@ def _execute( idle_minutes_to_autostop: int; if provided, the cluster will be set to autostop after this many minutes of idleness. no_setup: bool; whether to skip setup commands or not when (re-)launching. 
+ + Returns: + A job id (int) if the job is submitted successfully and backend is + CloudVmRayBackend, otherwise None. """ dag = _convert_to_dag(entrypoint) assert len(dag) == 1, f'We support 1 task for now. {dag}' @@ -326,6 +329,7 @@ def _execute( # Optimizer should eventually choose where to store bucket task.sync_storage_mounts() + job_id = None try: if Stage.PROVISION in stages: if handle is None: @@ -338,7 +342,7 @@ def _execute( if dryrun and handle is None: logger.info('Dryrun finished.') - return + return None if Stage.SYNC_WORKDIR in stages and not dryrun: if task.workdir is not None: @@ -363,7 +367,10 @@ def _execute( if Stage.EXEC in stages: try: global_user_state.update_last_use(handle.get_cluster_name()) - backend.execute(handle, task, detach_run, dryrun=dryrun) + job_id = backend.execute(handle, + task, + detach_run, + dryrun=dryrun) finally: # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, down) @@ -393,6 +400,7 @@ def _execute( not cluster_name.startswith(serve.CONTROLLER_PREFIX)): print() print('\x1b[?25h', end='') # Show cursor. + return job_id @timeline.event @@ -600,6 +608,20 @@ def exec( # pylint: disable=redefined-builtin detach_run=detach_run) +def _shared_controller_env_vars() -> Dict[str, Any]: + return { + 'SKYPILOT_USER_ID': common_utils.get_user_hash(), + 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': 1, + # Should not use $USER here, as that env var can be empty when + # running in a container. 
+ 'SKYPILOT_USER': getpass.getuser(), + 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), + 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), + 'SKYPILOT_DISABLE_USAGE_COLLECTION': + env_options.Options.DISABLE_LOGGING.get(), + } + + @usage_lib.entrypoint def spot_launch( task: Union['sky.Task', 'sky.Dag'], @@ -670,17 +692,11 @@ def spot_launch( 'uuid': dag_uuid, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, - 'is_dev': env_options.Options.IS_DEVELOPER.get(), - 'is_debug': env_options.Options.SHOW_DEBUG_INFO.get(), - 'disable_logging': env_options.Options.DISABLE_LOGGING.get(), - 'logging_user_hash': common_utils.get_user_hash(), 'retry_until_up': retry_until_up, - # Should not use $USER here, as that env var can be empty when - # running in a container. - 'user': getpass.getuser(), } controller_resources_config = copy.copy( spot.constants.CONTROLLER_RESOURCES) + spot_env_vars = _shared_controller_env_vars() if skypilot_config.loaded(): # Look up the contents of the already loaded configs via the # 'skypilot_config' module. Don't simply read the on-disk file as @@ -728,12 +744,16 @@ def spot_launch( proxy_command_key, ssh_proxy_command) with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmpfile: + prefix = spot.SPOT_TASK_YAML_PREFIX + remote_user_config_path = ( + f'{prefix}/{dag.name}-{dag_uuid}.config_yaml') common_utils.dump_yaml(tmpfile.name, config_dict) vars_to_fill.update({ 'user_config_path': tmpfile.name, - 'env_var_skypilot_config': - skypilot_config.ENV_VAR_SKYPILOT_CONFIG, + 'remote_user_config_path': remote_user_config_path, }) + spot_env_vars[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( + remote_user_config_path) # Override the controller resources with the ones specified in the # config. 
@@ -773,6 +793,8 @@ def spot_launch( controller_task.spot_dag = dag assert len(controller_task.resources) == 1 + controller_task.update_envs(spot_env_vars) + print(f'{colorama.Fore.YELLOW}' f'Launching managed spot job {dag.name} from spot controller...' f'{colorama.Style.RESET_ALL}') @@ -962,8 +984,6 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): def serve_up( task: 'sky.Task', service_name: str, - controller_resources: 'sky.Resources', - controller_best_resources: 'sky.Resources', ): """Spin up a service. @@ -972,27 +992,78 @@ def serve_up( Args: task: sky.Task to serve up. service_name: Name of the service. - controller_resources: The resources requirement for the controller. - controller_best_resources: The optimized resources for the controller. """ - controller_cluster_name = serve.generate_controller_cluster_name( - service_name) - controller_best_resources.cloud.check_cluster_name_is_valid( - controller_cluster_name) + if re.fullmatch(serve.SERVICE_NAME_VALID_REGEX, service_name) is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service name {service_name!r} is invalid: ' + f'ensure it is fully matched by regex (e.g., ' + 'only contains lower letters, numbers and dash): ' + f'{serve.SERVICE_NAME_VALID_REGEX}') + + if global_user_state.get_service_from_name(service_name) is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service name {service_name!r} is already ' + 'taken. 
Please use a different name.') + + if task.service is None: + raise RuntimeError('Service section not found.') + controller_resources_config: Dict[str, Any] = copy.copy( + serve.CONTROLLER_RESOURCES) + if task.service.controller_resources is not None: + controller_resources_config.update(task.service.controller_resources) + if 'ports' in controller_resources_config: + with ux_utils.print_exception_no_traceback(): + raise ValueError('Cannot specify ports for controller resources.') + # TODO(tian): Open required ports only after #2485 is merged. + controller_resources_config['ports'] = [serve.LOAD_BALANCER_PORT_RANGE] + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Encountered error when parsing controller resources') from e + assert task.service is not None, task assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] service_handle = serve.ServiceHandle( - controller_cluster_name=controller_cluster_name, + service_name=service_name, policy=task.service.policy_str(), requested_resources=requested_resources, - replica_info=[]) - global_user_state.add_or_update_service( - service_name, None, service_handle, - status_lib.ServiceStatus.CONTROLLER_INIT) - app_port = int(task.service.app_port) + requested_controller_resources=controller_resources) + # Use filelock here to make sure only one process can write to database + # at the same time. Then we generate available controller name again to + # make sure even in race condition, we can still get the correct controller + # name. + # In the same time, generate ports for the controller and load balancer. + # Use file lock to make sure the ports are unique to each service. 
+ try: + # TODO(tian): remove pylint disabling when filelock + # version updated + # pylint: disable=abstract-class-instantiated + with filelock.FileLock(serve.CONTROLLER_FILE_LOCK_PATH, + serve.CONTROLLER_FILE_LOCK_TIMEOUT): + controller_name, _ = serve.get_available_controller_name( + controller_resources) + global_user_state.add_or_update_service( + service_name, None, controller_name, service_handle, + status_lib.ServiceStatus.CONTROLLER_INIT) + + controller_port, load_balancer_port = ( + serve.gen_ports_for_serve_process(controller_name)) + service_handle.controller_port = controller_port + service_handle.load_balancer_port = load_balancer_port + global_user_state.set_service_handle(service_name, service_handle) + except filelock.Timeout as e: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Timeout when obtaining controller lock for service ' + f'{service_name!r}. Please check if there are some ' + '`sky serve up` process hanging abnormally.') from e - # TODO(tian): Use skyserve constants. + # TODO(tian): Use skyserve constants, or maybe refactor these constants + # out of spot constants since their name is mostly not spot-specific. 
_maybe_translate_local_file_mounts_and_sync_up(task) ephemeral_storage = [] if task.storage_mounts is not None: @@ -1011,11 +1082,22 @@ def serve_up( common_utils.dump_yaml(f.name, task_config) remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( service_name) + controller_log_file = ( + serve.generate_remote_controller_log_file_name(service_name)) + load_balancer_log_file = ( + serve.generate_remote_load_balancer_log_file_name(service_name)) vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': f.name, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, + 'service_dir': serve.generate_remote_service_dir_name(service_name), + 'service_name': service_name, + 'controller_port': controller_port, + 'load_balancer_port': load_balancer_port, + 'app_port': task.service.app_port, + 'controller_log_file': controller_log_file, + 'load_balancer_log_file': load_balancer_log_file, } controller_yaml_path = serve.generate_controller_yaml_file_name( service_name) @@ -1023,130 +1105,81 @@ def serve_up( vars_to_fill, output_path=controller_yaml_path) controller_task = task_lib.Task.from_yaml(controller_yaml_path) - # This is for the case when the best resources failed to provision. controller_task.set_resources(controller_resources) - controller_task.best_resources = controller_best_resources - - controller_envs = { - 'SKYPILOT_USER_ID': common_utils.get_user_hash(), - 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': True, - 'SKYPILOT_USER': getpass.getuser(), - 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), - 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), - 'SKYPILOT_DISABLE_USAGE_COLLECTION': - env_options.Options.DISABLE_LOGGING.get(), - } - controller_task.update_envs(controller_envs) - print(f'{colorama.Fore.YELLOW}' - f'Launching controller for {service_name}...' 
- f'{colorama.Style.RESET_ALL}') + # Set this flag to modify default ray task CPU usage to custom value + # instead of default 0.5 vCPU. We need to set it to a smaller value + # to support a larger number of services. + controller_task.is_sky_serve_controller_task = True - _execute( + controller_task.update_envs(_shared_controller_env_vars()) + + fore = colorama.Fore + style = colorama.Style + print(f'\n{fore.YELLOW}Launching controller for {service_name}...' + f'{style.RESET_ALL}') + job_id = _execute( entrypoint=controller_task, - stream_logs=True, - cluster_name=controller_cluster_name, + stream_logs=False, + cluster_name=controller_name, + detach_run=True, + # We use autostop here to reduce cold start time, since in most + # cases the controller resources requirement will be the default + # value and a previous controller could be reused. + idle_minutes_to_autostop=serve.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) - cluster_record = global_user_state.get_cluster_from_name( - controller_cluster_name) - if (cluster_record is None or - cluster_record['status'] != status_lib.ClusterStatus.UP): + controller_record = global_user_state.get_cluster_from_name( + controller_name) + if controller_record is None: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - print(f'{colorama.Fore.RED}Controller failed to launch. ' - f'Please check the logs above.{colorama.Style.RESET_ALL}') - return - - handle = cluster_record['handle'] + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Controller failed to launch. 
Please check the logs above.') + handle = controller_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) - endpoint = f'{handle.head_ip}:{app_port}' - service_handle.endpoint = endpoint + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend), backend + service_handle.endpoint_ip = handle.head_ip global_user_state.set_service_handle(service_name, service_handle) - console = rich_console.Console() - - def _wait_until_job_is_running(cluster_name: str, - job_id: int, - retry_time: int = 30) -> bool: - handle = global_user_state.get_handle_from_cluster_name( - cluster_name) - assert isinstance(handle, backends.CloudVmRayResourceHandle), handle - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend), backend - for _ in range(retry_time): + def _wait_until_job_is_running_on_controller( + job_id: Optional[int]) -> bool: + if job_id is None: + return False + for _ in range(serve.SERVE_STARTUP_TIMEOUT): job_statuses = backend.get_job_status(handle, [job_id], stream_logs=False) job_status = job_statuses.get(str(job_id), None) if job_status == job_lib.JobStatus.RUNNING: return True time.sleep(1) + # Cancel any jobs that are still pending after timeout. + if job_status == job_lib.JobStatus.PENDING: + backend.cancel_jobs(handle, jobs=[job_id]) return False - # NOTICE: The job submission order cannot be changed since the - # `sky serve logs` CLI will identify the controller job with - # the first job submitted and the load balancer job with the second - # job submitted. 
- with console.status('[yellow]Launching controller process...[/yellow]'): - _execute( - entrypoint=sky.Task( - name='run-controller', - envs=controller_envs, - run='python -m sky.serve.controller --service-name ' - f'{service_name} --task-yaml {remote_task_yaml_path} ' - f'--port {serve.CONTROLLER_PORT}'), - stream_logs=False, - handle=handle, - stages=[Stage.EXEC], - cluster_name=controller_cluster_name, - detach_run=True, - ) - controller_job_is_running = _wait_until_job_is_running( - controller_cluster_name, 1) - if not controller_job_is_running: + if not _wait_until_job_is_running_on_controller(job_id): global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - print(f'{colorama.Fore.RED}Controller failed to launch. ' - f'Please check the logs with sky serve logs {service_name} ' - f'--controller{colorama.Style.RESET_ALL}') - return - print(f'{colorama.Fore.GREEN}Launching controller process...done.' - f'{colorama.Style.RESET_ALL}') + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Controller failed to launch. Please check ' + f'the logs with sky serve logs {service_name} ' + '--controller') - with console.status( - '[yellow]Launching load balancer process...[/yellow]'): - controller_addr = f'http://localhost:{serve.CONTROLLER_PORT}' - _execute( - entrypoint=sky.Task( - name='run-load-balancer', - envs=controller_envs, - run='python -m sky.serve.load_balancer --task-yaml ' - f'{remote_task_yaml_path} --port {app_port} ' - f'--controller-addr {controller_addr}'), - stream_logs=False, - handle=handle, - stages=[Stage.EXEC], - cluster_name=controller_cluster_name, - detach_run=True, - ) - load_balancer_job_is_running = _wait_until_job_is_running( - controller_cluster_name, 2) - if not load_balancer_job_is_running: - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - print(f'{colorama.Fore.RED}LoadBalancer failed to launch. 
' - f'Please check the logs with sky serve logs {service_name} ' - f'--load-balancer{colorama.Style.RESET_ALL}') - return - print(f'{colorama.Fore.GREEN}Launching load balancer process...done.' - f'{colorama.Style.RESET_ALL}') + service_handle.job_id = job_id + global_user_state.set_service_handle(service_name, service_handle) + print(f'{fore.GREEN}Launching controller for {service_name}...done.' + f'{style.RESET_ALL}') global_user_state.set_service_status( service_name, status_lib.ServiceStatus.REPLICA_INIT) - print(f'\n{colorama.Fore.CYAN}Service name: ' - f'{colorama.Style.BRIGHT}{service_name}{colorama.Style.RESET_ALL}' + print(f'\n{fore.CYAN}Service name: ' + f'{style.BRIGHT}{service_name}{style.RESET_ALL}' '\nTo see detailed info:' f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' f'{backend_utils.RESET_BOLD}' @@ -1164,13 +1197,10 @@ def _wait_until_job_is_running(cluster_name: str, f'{backend_utils.RESET_BOLD}' f'\n(use {backend_utils.BOLD}sky serve status {service_name}' f'{backend_utils.RESET_BOLD} to get all valid REPLICA_ID)') - print(f'\n{colorama.Style.BRIGHT}{colorama.Fore.CYAN}' - 'Endpoint URL: ' - f'{colorama.Style.RESET_ALL}{colorama.Fore.CYAN}' - f'{endpoint}' - f'{colorama.Style.RESET_ALL}') - print(f'{colorama.Fore.GREEN}Starting replica now...' - f'{colorama.Style.RESET_ALL}') + print(f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' + f'{style.RESET_ALL}{fore.CYAN}' + f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') + print(f'{fore.GREEN}Starting replica now...{style.RESET_ALL}') print('Please use the above command to find the latest status.') @@ -1187,34 +1217,41 @@ def serve_down( purge: If true, ignore errors when cleaning up the controller. 
""" service_record = global_user_state.get_service_from_name(service_name) - # Already filtered all inexistent service in cli.py - assert service_record is not None, service_name - controller_cluster_name = service_record['handle'].controller_cluster_name + + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r} not found.') + + service_handle: serve.ServiceHandle = service_record['handle'] + controller_name = service_record['controller_name'] global_user_state.set_service_status(service_name, status_lib.ServiceStatus.SHUTTING_DOWN) - handle = global_user_state.get_handle_from_cluster_name( - controller_cluster_name) + handle = global_user_state.get_handle_from_cluster_name(controller_name) if handle is not None: backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) try: - code = serve.ServeCodeGen.terminate_service() + if service_handle.controller_port is None: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Controller job of service {service_name} not found.') + + code = serve.ServeCodeGen.terminate_service( + service_handle.controller_port) returncode, terminate_service_payload, stderr = backend.run_on_head( handle, code, require_outputs=True, stream_logs=False, separate_stderr=True) - try: - subprocess_utils.handle_returncode( - returncode, - code, ('Failed when submit terminate request to controller ' - f'of service {service_name}'), - stderr, - stream_logs=False) - except exceptions.CommandError as e: - raise RuntimeError(e.error_msg) from e + subprocess_utils.handle_returncode( + returncode, + code, ('Failed when submit terminate request to controller ' + f'of service {service_name}'), + stderr, + stream_logs=False) + resp = serve.load_terminate_service_result( terminate_service_payload) if resp.status_code != 200: @@ -1227,8 +1264,10 @@ def serve_down( 'Unexpected message when tearing down replica of service ' 
f'{service_name}: {msg}. Please login to the controller ' 'and make sure the service is properly cleaned.') - except (RuntimeError, ValueError, - requests.exceptions.ConnectionError) as e: + + # We want to make sure no matter what error happens, we can still + # clean up the record if purge is True. + except Exception as e: # pylint: disable=broad-except if purge: logger.warning('Ignoring error when cleaning replicas of ' f'{service_name}: {e}') @@ -1238,31 +1277,45 @@ def serve_down( if not purge: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'Cannot find controller cluster of service {service_name}.' - ) + f'Cannot find controller of service {service_name}.') try: if handle is not None: assert isinstance(handle, backends.CloudVmRayResourceHandle) backend = backends.CloudVmRayBackend() - backend.cancel_jobs(handle, jobs=None, cancel_all=True, silent=True) - except (ValueError, exceptions.ClusterNotUpError, - exceptions.CommandError) as e: + + # Cancel the controller and load balancer jobs. + # For the case when controller / load_balancer job failed to submit. + jobs = [] + if service_handle.job_id is not None: + jobs.append(service_handle.job_id) + backend.cancel_jobs(handle, jobs=jobs, silent=True) + + # Cleanup all files on controller related to this service. + # We have a 10-min grace period for the controller to autostop, + # so it should be fine if this is the last service on the + # controller and its job is the only one running. + code = serve.ServeCodeGen.cleanup_service_files(service_name) + returncode, _, stderr = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + subprocess_utils.handle_returncode( + returncode, + code, ('Failed when cleaning up utility files on controller ' + f'of service {service_name}'), + stderr, + stream_logs=False) + + # same as above. 
+ except Exception as e: # pylint: disable=broad-except if purge: logger.warning('Ignoring error when stopping controller and ' f'load balancer jobs of service {service_name}: {e}') else: raise RuntimeError(e) from e - try: - core.down(controller_cluster_name, purge=purge) - except (RuntimeError, ValueError) as e: - if purge: - logger.warning('Ignoring error when terminating controller VM of ' - f'service {service_name}: {e}') - else: - raise RuntimeError(e) from e - # TODO(tian): Maybe add a post_cleanup function? controller_yaml_path = serve.generate_controller_yaml_file_name( service_name) diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 98acdfd9eb3..6cf719d8e69 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -98,6 +98,7 @@ def create_table(cursor, conn): CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, launched_at INTEGER, + controller_name TEXT, handle BLOB, status TEXT)""") # For backward compatibility. @@ -281,18 +282,20 @@ def add_or_update_cluster(cluster_name: str, def add_or_update_service(name: str, launched_at: Optional[int], - handle: 'serve.ServiceHandle', + controller_name: str, handle: 'serve.ServiceHandle', status: status_lib.ServiceStatus) -> None: if launched_at is None: launched_at = int(time.time()) _DB.cursor.execute( 'INSERT or REPLACE INTO services' - '(name, launched_at, handle, status) ' + '(name, launched_at, controller_name, handle, status) ' 'VALUES (' # name '?, ' # launched_at '?, ' + # controller_name + '?, ' # handle '?, ' # status @@ -303,6 +306,8 @@ def add_or_update_service(name: str, launched_at: Optional[int], name, # launched_at launched_at, + # controller_name + controller_name, # handle pickle.dumps(handle), # status @@ -610,11 +615,12 @@ def _get_service_from_row(row) -> Dict[str, Any]: # Explicitly specify the number of fields to unpack, so that # we can add new fields to the database in the future without # breaking the previous code. 
- name, launched_at, handle, status = row[:4] + name, launched_at, controller_name, handle, status = row[:5] # TODO: use namedtuple instead of dict return { 'name': name, 'launched_at': launched_at, + 'controller_name': controller_name, 'handle': pickle.loads(handle), 'status': status_lib.ServiceStatus[status], } @@ -629,6 +635,18 @@ def get_service_from_name( return None +def get_services_from_controller_name( + controller_name: str) -> List[Dict[str, Any]]: + rows = _DB.cursor.execute( + 'SELECT * FROM services WHERE controller_name=(?)', + (controller_name,)).fetchall() + records = [] + for row in rows: + record = _get_service_from_row(row) + records.append(record) + return records + + def get_handle_from_service_name( service_name: Optional[str]) -> Optional['serve.ServiceHandle']: rows = _DB.cursor.execute('SELECT handle FROM services WHERE name=(?)', diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 68bb6a02581..7a581ac4c50 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,15 +1,30 @@ """Modules for SkyServe services.""" -from sky.serve.constants import CONTROLLER_PORT +import os + +from sky.serve.constants import CONTROLLER_FILE_LOCK_PATH +from sky.serve.constants import CONTROLLER_FILE_LOCK_TIMEOUT +from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.serve.constants import CONTROLLER_PREFIX from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE -from sky.serve.serve_utils import generate_controller_cluster_name +from sky.serve.constants import LOAD_BALANCER_PORT_RANGE +from sky.serve.constants import SERVE_PREFIX +from sky.serve.constants import SERVE_STARTUP_TIMEOUT +from sky.serve.constants import SERVICE_NAME_VALID_REGEX +from sky.serve.constants import SERVICES_TASK_CPU_DEMAND +from sky.serve.serve_utils import gen_ports_for_serve_process from sky.serve.serve_utils import 
generate_controller_yaml_file_name +from sky.serve.serve_utils import generate_remote_controller_log_file_name +from sky.serve.serve_utils import generate_remote_load_balancer_log_file_name +from sky.serve.serve_utils import generate_remote_service_dir_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name +from sky.serve.serve_utils import get_available_controller_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec + +os.makedirs(SERVE_PREFIX, exist_ok=True) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 91ea9b48c46..d00ea4d2689 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -1,19 +1,50 @@ """Constants used for SkyServe.""" +CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 + CONTROLLER_PREFIX = 'sky-serve-controller-' CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' SERVE_PREFIX = '~/.sky/serve' -CONTROLLER_PORT = 31001 +# This is the same with sky.skylet.constants.CLUSTER_NAME_VALID_REGEX +# The service name will be used as: +# 1. controller cluster name: 'sky-serve-controller-' +# 2. replica cluster name: '-' +# In both cases, service name shares the same regex with cluster name. +SERVICE_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' + +CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' +CONTROLLER_FILE_LOCK_TIMEOUT = 20 + CONTROLLER_SYNC_INTERVAL = 20 +READINESS_PROBE_TIMEOUT = 15 -# We need 200GB disk space to enable using Azure as controller, since its image -# size is 150GB. -CONTROLLER_RESOURCES = {'disk_size': 200, 'cpus': '4+'} +SERVE_STARTUP_TIMEOUT = 60 + +# We need 200 GB disk space to enable using Azure as controller, since its image +# size is 150 GB. 
Also, we need 32 GB memory to run our controller and load +# balancer jobs since it is very memory demanding. +# TODO(tian): We might need to be careful that service logs can take a lot of +# disk space. Maybe we could use a larger disk size or migrate to cloud storage. +CONTROLLER_RESOURCES = {'disk_size': 200, 'memory': '32+'} + +# Our ray jobs is very memory demanding and number of services on a single +# controller is limited by memory. Rough benchmark result shows each service +# needs ~0.6 GB to run only for controller and load balancer process. +# Considering there will be some sky launch and sky down process on the fly, we +# set the memory usage to 2 GB to be safe. +# In this setup, a default highmem controller with 4 vCPU and 32 GB memory can +# run 16 services. +SERVICES_MEMORY_USAGE_GB = 2.0 +SERVICES_TASK_CPU_DEMAND = 0.125 # A period of time to initialize your service. Any readiness probe failures # during this period will be ignored. DEFAULT_INITIAL_DELAY_SECONDS = 1200 DEFAULT_MIN_REPLICAS = 1 + +CONTROLLER_PORT_START = 20001 +LOAD_BALANCER_PORT_START = 30001 +LOAD_BALANCER_PORT_RANGE = '30001-31000' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 637c6cb3231..5d3f9b4184a 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -123,8 +123,7 @@ def terminate(request: fastapi.Request): type=str, help='Task YAML file', required=True) - parser.add_argument('--port', - '-p', + parser.add_argument('--controller-port', type=int, help='Port to run the controller', required=True) @@ -139,6 +138,7 @@ def terminate(request: fastapi.Request): _infra_provider = infra_providers.SkyPilotInfraProvider( args.task_yaml, args.service_name, + controller_port=args.controller_port, readiness_suffix=service_spec.readiness_suffix, initial_delay_seconds=service_spec.initial_delay_seconds, post_data=service_spec.post_data) @@ -156,5 +156,6 @@ def terminate(request: fastapi.Request): query_interval=60) # ======= SkyServeController ========= 
- controller = SkyServeController(args.port, _infra_provider, _autoscaler) + controller = SkyServeController(args.controller_port, _infra_provider, + _autoscaler) controller.run() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index c7419c9d524..64ff4093bd3 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -15,6 +15,7 @@ from sky import backends from sky import global_user_state from sky import status_lib +from sky.serve import constants from sky.serve import serve_utils from sky.skylet import job_lib from sky.utils import env_options @@ -195,11 +196,13 @@ class InfraProvider: def __init__( self, + controller_port: int, readiness_suffix: str, initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: self.replica_info: serve_utils.ThreadSafeDict[ str, ReplicaInfo] = serve_utils.ThreadSafeDict() + self.controller_port = controller_port self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data @@ -414,7 +417,8 @@ def _launch_cluster(self, replica_id: int) -> None: logger.info(f'Creating SkyPilot cluster {cluster_name}') cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) - fn = serve_utils.generate_replica_launch_log_file_name(cluster_name) + fn = serve_utils.generate_replica_launch_log_file_name( + self.service_name, replica_id) with open(fn, 'w') as f: # pylint: disable=consider-using-with p = subprocess.Popen(cmd, @@ -442,17 +446,18 @@ def _teardown_cluster(self, 'exists. 
Skipping.') return + replica_id = serve_utils.get_replica_id_from_cluster_name(cluster_name) if sync_down_logs: logger.info(f'Syncing down logs for cluster {cluster_name}...') - replica_id = serve_utils.get_replica_id_from_cluster_name( - cluster_name) - code = serve_utils.ServeCodeGen.stream_logs( + code = serve_utils.ServeCodeGen.stream_replica_logs( self.service_name, + self.controller_port, replica_id, follow=False, skip_local_log_file_check=True) local_log_file_name = ( - serve_utils.generate_replica_local_log_file_name(cluster_name)) + serve_utils.generate_replica_local_log_file_name( + self.service_name, replica_id)) with open(local_log_file_name, 'w') as f: try: subprocess.run(code, shell=True, check=True, stdout=f) @@ -466,7 +471,8 @@ def _teardown_cluster(self, logger.info(f'Deleting SkyPilot cluster {cluster_name}') cmd = ['sky', 'down', cluster_name, '-y'] - fn = serve_utils.generate_replica_down_log_file_name(cluster_name) + fn = serve_utils.generate_replica_down_log_file_name( + self.service_name, replica_id) with open(fn, 'w') as f: # pylint: disable=consider-using-with p = subprocess.Popen(cmd, @@ -573,12 +579,15 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: readiness_suffix = f'http://{replica_ip}{self.readiness_suffix}' if self.post_data is not None: msg += 'Post' - response = requests.post(readiness_suffix, - json=self.post_data, - timeout=3) + response = requests.post( + readiness_suffix, + json=self.post_data, + timeout=constants.READINESS_PROBE_TIMEOUT) else: msg += 'Get' - response = requests.get(readiness_suffix, timeout=3) + response = requests.get( + readiness_suffix, + timeout=constants.READINESS_PROBE_TIMEOUT) msg += (f' request to {replica_ip} returned status code ' f'{response.status_code}') if response.status_code == 200: diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index a73b8cb5eb6..687fbffb574 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -5,6 +5,7 @@ import 
fastapi import requests +from urllib3 import exceptions import uvicorn from sky import sky_logging @@ -26,26 +27,34 @@ class SkyServeLoadBalancer: """ def __init__( - self, controller_url: str, port: int, + self, controller_url: str, load_balancer_port: int, app_port: int, load_balancing_policy: load_balancing_policies.LoadBalancingPolicy ) -> None: self.app = fastapi.FastAPI() self.controller_url = controller_url - self.port = port + # This is the port where the load balancer listens to. + self.load_balancer_port = load_balancer_port + # This is the port where the replica app listens to. + self.app_port = app_port self.load_balancing_policy = load_balancing_policy - - for i in range(3): - resp = requests.get(self.controller_url + - '/controller/get_autoscaler_query_interval') + self.setup_query_interval() + + def setup_query_interval(self): + for _ in range(3): + try: + resp = requests.get(self.controller_url + + '/controller/get_autoscaler_query_interval') + except exceptions.MaxRetryError: + # Retry if cannot connect to controller + continue if resp.status_code == 200: self.load_balancing_policy.set_query_interval( resp.json()['query_interval']) - break - if i == 2: - logger.error('Failed to get autoscaler query interval. ' - 'Use default interval instead.') - self.load_balancing_policy.set_query_interval(None) + return time.sleep(10) + logger.error('Failed to get autoscaler query interval. 
' + 'Use default interval instead.') + self.load_balancing_policy.set_query_interval(None) def _sync_with_controller(self): while True: @@ -83,7 +92,7 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_ID]" ' 'to check the replica status.') - path = f'http://{replica_ip}:{self.port}{request.url.path}' + path = f'http://{replica_ip}:{self.app_port}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) @@ -96,10 +105,10 @@ def run(self): target=self._sync_with_controller, daemon=True) sync_controller_thread.start() - logger.info( - f'SkyServe Load Balancer started on http://0.0.0.0:{self.port}') + logger.info('SkyServe Load Balancer started on ' + f'http://0.0.0.0:{self.load_balancer_port}') - uvicorn.run(self.app, host='0.0.0.0', port=self.port) + uvicorn.run(self.app, host='0.0.0.0', port=self.load_balancer_port) if __name__ == '__main__': @@ -109,11 +118,14 @@ def run(self): type=str, help='Task YAML file', required=True) - parser.add_argument('--port', - '-p', + parser.add_argument('--load-balancer-port', type=int, help='Port to run the load balancer on.', required=True) + parser.add_argument('--app-port', + type=int, + help='Port that runs app on replica.', + required=True) parser.add_argument('--controller-addr', type=str, help='Controller address (ip:port).', @@ -126,6 +138,7 @@ def run(self): # ======= SkyServeLoadBalancer ========= load_balancer = SkyServeLoadBalancer( controller_url=args.controller_addr, - port=args.port, + load_balancer_port=args.load_balancer_port, + app_port=args.app_port, load_balancing_policy=_load_balancing_policy) load_balancer.run() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 08a807f65c6..e580029a12a 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -4,16 +4,18 @@ import pickle import re import shlex +import shutil import threading import time import typing -from typing import 
(Any, Callable, Dict, Generic, Iterator, List, Optional, - TextIO, TypeVar) +from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, Set, + TextIO, Tuple, TypeVar) import colorama import requests from sky import backends +from sky import clouds from sky import global_user_state from sky import status_lib from sky.data import storage as storage_lib @@ -23,7 +25,7 @@ if typing.TYPE_CHECKING: import sky -_CONTROLLER_URL = f'http://localhost:{constants.CONTROLLER_PORT}' +_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}' _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' _FAILED_TO_FIND_REPLICA_MSG = ( @@ -71,15 +73,21 @@ def values(self): return self._dict.values() -def generate_controller_cluster_name(service_name: str) -> str: - return constants.CONTROLLER_PREFIX + service_name +def get_existing_controller_names() -> Set[str]: + return { + record['controller_name'] + for record in global_user_state.get_services() + } -def generate_remote_task_yaml_file_name(service_name: str) -> str: - service_name = service_name.replace('-', '_') - # Don't expand here since it is used for remote machine. 
- prefix = constants.SERVE_PREFIX - return os.path.join(prefix, f'{service_name}.yaml') +def generate_controller_cluster_name(existing_controllers: Set[str]) -> str: + index = 0 + while True: + controller_name = (f'{constants.CONTROLLER_PREFIX}' + f'{common_utils.get_user_hash()}-{index}') + if controller_name not in existing_controllers: + return controller_name + index += 1 def generate_controller_yaml_file_name(service_name: str) -> str: @@ -88,74 +96,245 @@ def generate_controller_yaml_file_name(service_name: str) -> str: return os.path.join(prefix, f'{service_name}_controller.yaml') -def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: - return f'{service_name}-{replica_id}' +def generate_remote_service_dir_name(service_name: str) -> str: + service_name = service_name.replace('-', '_') + return os.path.join(constants.SERVE_PREFIX, service_name) -def get_replica_id_from_cluster_name(cluster_name: str) -> int: - return int(cluster_name.split('-')[-1]) +def generate_remote_task_yaml_file_name(service_name: str) -> str: + dir_name = generate_remote_service_dir_name(service_name) + # Don't expand here since it is used for remote machine. + return os.path.join(dir_name, 'task.yaml') -def generate_replica_launch_log_file_name(cluster_name: str) -> str: - cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVE_PREFIX) - return os.path.join(prefix, f'{cluster_name}_launch.log') +def generate_remote_controller_log_file_name(service_name: str) -> str: + dir_name = generate_remote_service_dir_name(service_name) + # Don't expand here since it is used for remote machine. 
+ return os.path.join(dir_name, 'controller.log') -def generate_replica_down_log_file_name(cluster_name: str) -> str: - cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVE_PREFIX) - return os.path.join(prefix, f'{cluster_name}_down.log') +def generate_remote_load_balancer_log_file_name(service_name: str) -> str: + dir_name = generate_remote_service_dir_name(service_name) + # Don't expand here since it is used for remote machine. + return os.path.join(dir_name, 'load_balancer.log') -def generate_replica_local_log_file_name(cluster_name: str) -> str: - cluster_name = cluster_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVE_PREFIX) - return os.path.join(prefix, f'{cluster_name}_local.log') +def generate_replica_launch_log_file_name(service_name: str, + replica_id: int) -> str: + dir_name = generate_remote_service_dir_name(service_name) + dir_name = os.path.expanduser(dir_name) + return os.path.join(dir_name, f'replica_{replica_id}_launch.log') + + +def generate_replica_down_log_file_name(service_name: str, + replica_id: int) -> str: + dir_name = generate_remote_service_dir_name(service_name) + dir_name = os.path.expanduser(dir_name) + return os.path.join(dir_name, f'replica_{replica_id}_down.log') + + +def generate_replica_local_log_file_name(service_name: str, + replica_id: int) -> str: + dir_name = generate_remote_service_dir_name(service_name) + dir_name = os.path.expanduser(dir_name) + return os.path.join(dir_name, f'replica_{replica_id}_local.log') + + +def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: + return f'{service_name}-{replica_id}' + + +def get_replica_id_from_cluster_name(cluster_name: str) -> int: + return int(cluster_name.split('-')[-1]) + + +def gen_ports_for_serve_process(controller_name: str) -> Tuple[int, int]: + services = global_user_state.get_services_from_controller_name( + controller_name) + # Use `is None` to filter out self and all services with 
initialize status + existing_controller_ports, existing_load_balancer_ports = set(), set() + for service in services: + service_handle: ServiceHandle = service['handle'] + if service_handle.controller_port is not None: + existing_controller_ports.add(service_handle.controller_port) + if service_handle.load_balancer_port is not None: + existing_load_balancer_ports.add(service_handle.load_balancer_port) + # Cannot expose controller to public internet. + # We opened 30001-31000 for controller VM, so load balancer port + # should be in this range and controller port should not be in + # this range. + controller_port = constants.CONTROLLER_PORT_START + while controller_port in existing_controller_ports: + controller_port += 1 + load_balancer_port = constants.LOAD_BALANCER_PORT_START + while load_balancer_port in existing_load_balancer_ports: + load_balancer_port += 1 + return controller_port, load_balancer_port + + +def _get_service_num_on_controller_if_available( + controller_name: str, + requested_controller_resources: 'sky.Resources') -> Optional[int]: + """Get number of services on the controller if it is available. + + A controller is available if requested controller resources is less + demanding than the controller resources, and have available slots for + services. Max number of services on a controller is determined by the memory + of the controller, since ray job and our skypilot code is very memory + demanding (~1GB/service). + + Args: + controller_name: The name of the controller. + requested_controller_resources: The resources requested for controller. + + Returns: + Number of services on the controller if it is available, otherwise None. + """ + controller_available = False + max_memory_requirements = 0. + controller_record = global_user_state.get_cluster_from_name(controller_name) + if controller_record is not None: + # If controller is already created, use its launched resources. 
+ handle = controller_record['handle'] + assert isinstance(handle, backends.CloudVmRayResourceHandle) + if requested_controller_resources.less_demanding_than( + handle.launched_resources): + controller_available = True + # Determine max number of services on this controller. + controller_cloud = handle.launched_resources.cloud + _, max_memory_requirements = ( + controller_cloud.get_vcpus_mem_from_instance_type( + handle.launched_resources.instance_type)) + else: + # Corner case: Multiple `sky serve up` are running simultaneously + # and the controller is not created yet. We created a resources + # for each initializing controller, and find the most demanding + # one to represent the controller resources. + service_records = (global_user_state.get_services_from_controller_name( + controller_name)) + for service_record in service_records: + r = service_record['handle'].requested_controller_resources + # If any service is more demanding than the requested resources, + # then the controller is available since it must be launched + # with the most demanding resources, which is more demanding + # than the requested resources. + if requested_controller_resources.less_demanding_than(r): + controller_available = True + # Don't break here since we still want to find the max + # memory requirements. + # Remove the '+' in memory requirement. + max_memory_requirements = max(max_memory_requirements, + float(r.memory.strip('+'))) + if controller_available: + # Determine max number of services on this controller. + max_services_num = int(max_memory_requirements / + constants.SERVICES_MEMORY_USAGE_GB) + # Get current number of services on this controller. + services_num_on_controller = len( + global_user_state.get_services_from_controller_name( + controller_name)) + # Only consider controllers that have available slots for services. 
+ if services_num_on_controller < max_services_num: + return services_num_on_controller + return None + + +def get_available_controller_name( + controller_resources: 'sky.Resources') -> Tuple[str, bool]: + """Get available controller name to use. + + Only consider controllers that satisfy the requested controller resources, + and have available slots for services. + If multiple controllers are available, choose the one with most number of + services to decrease the number of controllers. + + Args: + controller_resources: The resources requested for controller. + + Returns: + A tuple of controller name and a boolean value indicating whether the + controller name is newly generated. + """ + # Get all existing controllers. + existing_controllers = get_existing_controller_names() + available_controller_to_service_num = dict() + # Get a mapping from controller name to number of services on it. + for controller_name in existing_controllers: + services_num_on_controller = ( + _get_service_num_on_controller_if_available(controller_name, + controller_resources)) + if services_num_on_controller is not None: + available_controller_to_service_num[controller_name] = ( + services_num_on_controller) + if not available_controller_to_service_num: + new_controller_name = generate_controller_cluster_name( + existing_controllers) + # This check should always be true since we already checked the + # service name is valid in `sky.serve_up`. + clouds.Cloud.check_cluster_name_is_valid(new_controller_name) + return new_controller_name, True + # If multiple controllers are available, choose the one with most number of + # services. + return max(available_controller_to_service_num, + key=lambda k: available_controller_to_service_num[k]), False class ServiceHandle(object): """A pickle-able tuple of: - - (required) Controller cluster name. + - (required) Service name. - (required) Service autoscaling policy description str. - (required) Service requested resources. 
- - (required) All replica info. + - (required) Service requested controller resources. - (optional) Service uptime. - - (optional) Service endpoint URL. + - (optional) Service endpoint IP. + - (optional) Controller port. + - (optional) LoadBalancer port. + - (optional) Controller and LoadBalancer job id. - (optional) Ephemeral storage generated for the service. This class is only used as a cache for information fetched from controller. """ - _VERSION = 1 + _VERSION = 0 def __init__( - self, - *, - controller_cluster_name: str, - policy: str, - requested_resources: 'sky.Resources', - replica_info: List[Dict[str, Any]], - uptime: Optional[int] = None, - endpoint: Optional[str] = None, - ephemeral_storage: Optional[List[Dict[str, Any]]] = None) -> None: + self, + *, + service_name: str, + policy: str, + requested_resources: 'sky.Resources', + requested_controller_resources: 'sky.Resources', + uptime: Optional[int] = None, + endpoint_ip: Optional[str] = None, + controller_port: Optional[int] = None, + load_balancer_port: Optional[int] = None, + job_id: Optional[int] = None, + ephemeral_storage: Optional[List[Dict[str, Any]]] = None, + ) -> None: self._version = self._VERSION - self.controller_cluster_name = controller_cluster_name - self.replica_info = replica_info + self.service_name = service_name self.uptime = uptime - self.endpoint = endpoint + self.endpoint_ip = endpoint_ip self.policy = policy self.requested_resources = requested_resources + self.requested_controller_resources = requested_controller_resources + self.controller_port = controller_port + self.load_balancer_port = load_balancer_port + self.job_id = job_id self.ephemeral_storage = ephemeral_storage def __repr__(self): return ('ServiceHandle(' - f'\n\tcontroller_cluster_name={self.controller_cluster_name},' - f'\n\treplica_info={self.replica_info},' + f'\n\tservice_name={self.service_name},' f'\n\tuptime={self.uptime},' - f'\n\tendpoint={self.endpoint},' + f'\n\tendpoint_ip={self.endpoint_ip},' 
f'\n\tpolicy={self.policy},' f'\n\trequested_resources={self.requested_resources},' + '\n\trequested_controller_resources=' + f'{self.requested_controller_resources},' + f'\n\tcontroller_port={self.controller_port},' + f'\n\tload_balancer_port={self.load_balancer_port},' + f'\n\tjob_id={self.job_id},' f'\n\tephemeral_storage={self.ephemeral_storage})') def cleanup_ephemeral_storage(self) -> None: @@ -170,8 +349,10 @@ def __setstate__(self, state): self.__dict__.update(state) -def get_latest_info() -> str: - resp = requests.get(_CONTROLLER_URL + '/controller/get_latest_info') +def get_latest_info(controller_port: int) -> str: + resp = requests.get( + _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + + '/controller/get_latest_info') if resp.status_code != 200: raise ValueError(f'Failed to get replica info: {resp.text}') return common_utils.encode_payload(resp.json()) @@ -185,8 +366,10 @@ def load_latest_info(payload: str) -> Dict[str, Any]: return latest_info -def terminate_service() -> str: - resp = requests.post(_CONTROLLER_URL + '/controller/terminate') +def terminate_service(controller_port: int) -> str: + resp = requests.post( + _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + + '/controller/terminate') resp = base64.b64encode(pickle.dumps(resp)).decode('utf-8') return common_utils.encode_payload(resp) @@ -197,12 +380,13 @@ def load_terminate_service_result(payload: str) -> Any: return terminate_resp -def _follow_logs(file: TextIO, - cluster_name: str, - *, - finish_stream: Callable[[], bool], - exit_if_stream_end: bool = False, - no_new_content_timeout: Optional[int] = None) -> Iterator[str]: +def _follow_replica_logs( + file: TextIO, + cluster_name: str, + *, + finish_stream: Callable[[], bool], + exit_if_stream_end: bool = False, + no_new_content_timeout: Optional[int] = None) -> Iterator[str]: line = '' log_file = None no_new_content_cnt = 0 @@ -240,7 +424,7 @@ def cluster_is_up() -> bool: # We still exit if more than 10 seconds without new 
# content to avoid any internal bug that causes # the launch failed and cluster status remains INIT. - for l in _follow_logs( + for l in _follow_replica_logs( f, cluster_name, finish_stream=cluster_is_up, @@ -259,16 +443,15 @@ def cluster_is_up() -> bool: time.sleep(1) -def stream_logs(service_name: str, - replica_id: int, - follow: bool, - skip_local_log_file_check: bool = False) -> str: +def stream_replica_logs(service_name: str, + controller_port: int, + replica_id: int, + follow: bool, + skip_local_log_file_check: bool = False) -> str: print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process ' f'of replica {replica_id}.{colorama.Style.RESET_ALL}') - replica_cluster_name = generate_replica_cluster_name( - service_name, replica_id) local_log_file_name = generate_replica_local_log_file_name( - replica_cluster_name) + service_name, replica_id) if not skip_local_log_file_check and os.path.exists(local_log_file_name): # When sync down, we set skip_local_log_file_check to False so it won't @@ -279,6 +462,8 @@ def stream_logs(service_name: str, print(f.read(), flush=True) return '' + replica_cluster_name = generate_replica_cluster_name( + service_name, replica_id) handle = global_user_state.get_handle_from_cluster_name( replica_cluster_name) if handle is None: @@ -286,13 +471,15 @@ def stream_logs(service_name: str, assert isinstance(handle, backends.CloudVmRayResourceHandle), handle launch_log_file_name = generate_replica_launch_log_file_name( - replica_cluster_name) + service_name, replica_id) if not os.path.exists(launch_log_file_name): return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.' 
f'{colorama.Style.RESET_ALL}') def _get_replica_status() -> status_lib.ReplicaStatus: - resp = requests.get(_CONTROLLER_URL + '/controller/get_latest_info') + resp = requests.get( + _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + + '/controller/get_latest_info') if resp.status_code != 200: raise ValueError( f'{colorama.Fore.RED}Failed to get replica info for service ' @@ -312,10 +499,10 @@ def _get_replica_status() -> status_lib.ReplicaStatus: finish_stream = ( lambda: _get_replica_status() != status_lib.ReplicaStatus.PROVISIONING) with open(launch_log_file_name, 'r', newline='') as f: - for line in _follow_logs(f, - replica_cluster_name, - finish_stream=finish_stream, - exit_if_stream_end=not follow): + for line in _follow_replica_logs(f, + replica_cluster_name, + finish_stream=finish_stream, + exit_if_stream_end=not follow): print(line, end='', flush=True) if not follow and _get_replica_status( ) == status_lib.ReplicaStatus.PROVISIONING: @@ -335,42 +522,94 @@ def _get_replica_status() -> status_lib.ReplicaStatus: return '' +def _follow_logs(file: TextIO, exit_if_stream_end: bool) -> Iterator[str]: + line = '' + while True: + tmp = file.readline() + if tmp is not None and tmp != '': + line += tmp + if '\n' in line or '\r' in line: + yield line + line = '' + else: + if exit_if_stream_end: + break + time.sleep(1) + + +def stream_serve_process_logs(service_name: str, stream_controller: bool, + follow: bool) -> None: + if stream_controller: + log_file = generate_remote_controller_log_file_name(service_name) + else: + log_file = generate_remote_load_balancer_log_file_name(service_name) + with open(os.path.expanduser(log_file), 'r', newline='') as f: + for line in _follow_logs(f, exit_if_stream_end=not follow): + print(line, end='', flush=True) + + +def cleanup_service_files(service_name: str) -> None: + """Cleanup utility files for a service.""" + dir_name = generate_remote_service_dir_name(service_name) + dir_name = os.path.expanduser(dir_name) + if 
os.path.exists(dir_name): + shutil.rmtree(dir_name) + + class ServeCodeGen: """Code generator for SkyServe. Usage: - >> code = ServeCodeGen.get_latest_info() + >> code = ServeCodeGen.get_latest_info(controller_port) """ _PREFIX = [ 'from sky.serve import serve_utils', ] @classmethod - def get_latest_info(cls) -> str: + def get_latest_info(cls, controller_port: int) -> str: code = [ - 'msg = serve_utils.get_latest_info()', + f'msg = serve_utils.get_latest_info({controller_port})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def terminate_service(cls) -> str: + def terminate_service(cls, controller_port: int) -> str: code = [ - 'msg = serve_utils.terminate_service()', + f'msg = serve_utils.terminate_service({controller_port})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def stream_logs(cls, - service_name: str, - replica_id: int, - follow: bool, - skip_local_log_file_check: bool = False) -> str: + def stream_replica_logs(cls, + service_name: str, + controller_port: int, + replica_id: int, + follow: bool, + skip_local_log_file_check: bool = False) -> str: + code = [ + f'msg = serve_utils.stream_replica_logs({service_name!r}, ' + f'{controller_port}, {replica_id!r}, follow={follow}, ' + f'skip_local_log_file_check={skip_local_log_file_check})', + 'print(msg, flush=True)' + ] + return cls._build(code) + + @classmethod + def stream_serve_process_logs(cls, service_name: str, + stream_controller: bool, follow: bool) -> str: + code = [ + f'serve_utils.stream_serve_process_logs({service_name!r}, ' + f'{stream_controller}, follow={follow})', + ] + return cls._build(code) + + @classmethod + def cleanup_service_files(cls, service_name: str) -> str: code = [ - f'msg = serve_utils.stream_logs({service_name!r}, {replica_id!r}, ' - f'follow={follow}, skip_local_log_file_check=' - f'{skip_local_log_file_check})', 'print(msg, flush=True)' + f'serve_utils.cleanup_service_files({service_name!r})', ] return cls._build(code) 
diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 6e2867e6562..2d694ef36fb 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -37,12 +37,6 @@ def __init__( raise ValueError( 'max_replicas must be greater than or equal to min_replicas' ) - if app_port == constants.CONTROLLER_PORT: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'App port cannot be {constants.CONTROLLER_PORT} ' - 'since it is reserved for the controller. ' - 'Please use a different port.') if not readiness_path.startswith('/'): with ux_utils.print_exception_no_traceback(): raise ValueError('readiness_path must start with a slash (/). ' diff --git a/sky/task.py b/sky/task.py index 5adfceb671b..ae5ca1748d0 100644 --- a/sky/task.py +++ b/sky/task.py @@ -242,6 +242,9 @@ def __init__( # the underlying managed spot dag (sky.Dag object). self.spot_dag: Optional['sky.Dag'] = None + # Only set to True when 'self' is a sky serve controller task. + self.is_sky_serve_controller_task = False + # Filled in by the optimizer. If None, this Task is not planned. self.best_resources = None # Check if the task is legal. diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index f9965a187d1..c8e6d983bad 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -1,4 +1,6 @@ -# The template for skyserve controller +# The template for the sky serve controller + +name: {{service_name}} setup: | # Install all cloud dependencies. @@ -12,3 +14,20 @@ setup: | file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} + +run: | + # Create working directory for this services. + mkdir -p {{service_dir}} + # Start sky serve controller. + python -u -m sky.serve.controller --service-name {{service_name}} \ + --task-yaml {{remote_task_yaml_path}} --controller-port {{controller_port}} \ + > {{controller_log_file}} 2>&1 & + # Wait for controller to start. 
+ sleep 10 + # Start sky serve load balancer. We keep the load balancer running in the + # foreground so that the job will not finish, thus prevent our controller + # from auto down. + python -u -m sky.serve.load_balancer --task-yaml {{remote_task_yaml_path}} \ + --load-balancer-port {{load_balancer_port}} --app-port {{app_port}} \ + --controller-addr http://localhost:{{controller_port}} \ + > {{load_balancer_log_file}} 2>&1 diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 7113fe2f265..db24fbb8a42 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -5,7 +5,7 @@ name: {{dag_name}} file_mounts: {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml: {{user_yaml_path}} {% if user_config_path is not none %} - {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.config_yaml: {{user_config_path}} + {{remote_user_config_path}}: {{user_config_path}} {% endif %} setup: | @@ -36,21 +36,3 @@ run: | python -u -m sky.spot.controller \ {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml \ --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %} - -envs: - SKYPILOT_USER_ID: {{logging_user_hash}} - # skip cloud identity check for spot controller to avoid the overhead. 
- SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK: 1 - SKYPILOT_USER: {{user}} -{% if user_config_path is not none %} - {{env_var_skypilot_config}}: {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.config_yaml -{% endif %} -{% if is_dev %} - SKYPILOT_DEV: 1 -{% endif %} -{% if is_debug %} - SKYPILOT_DEBUG: 1 -{% endif %} -{% if disable_logging %} - SKYPILOT_DISABLE_USAGE_COLLECTION: 1 -{% endif %} diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index d86d61eb160..8df9f28b5e3 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -124,8 +124,8 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): StatusColumn('UPTIME', _get_uptime), StatusColumn('STATUS', _get_service_status_colored), StatusColumn('REPLICAS', _get_replicas), - StatusColumn('CONTROLLER_CLUSTER_NAME', - _get_controller_cluster_name, + StatusColumn('CONTROLLER_NAME', + _get_controller_name, show_by_default=False), StatusColumn('ENDPOINT', _get_endpoint), StatusColumn('POLICY', _get_policy, show_by_default=False), @@ -398,9 +398,8 @@ def _get_service_handle( return service_record['handle'] -def _get_controller_cluster_name(service_record: _ServiceRecord) -> str: - handle = _get_service_handle(service_record) - return handle.controller_cluster_name +def _get_controller_name(service_record: _ServiceRecord) -> str: + return service_record['controller_name'] def _get_policy(service_record: _ServiceRecord) -> str: @@ -423,20 +422,20 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: ready_replica_num = 0 - handle = _get_service_handle(service_record) - for info in handle.replica_info: + for info in service_record['replica_info']: if _get_status(info) == status_lib.ReplicaStatus.READY: ready_replica_num += 1 - total_replica_num = len(handle.replica_info) + total_replica_num = len(service_record['replica_info']) return 
f'{ready_replica_num}/{total_replica_num}' def _get_endpoint(service_record: _ServiceRecord) -> str: handle = _get_service_handle(service_record) - endpoint = handle.endpoint - if not endpoint: + if handle.endpoint_ip is None: + return '-' + if handle.load_balancer_port is None: return '-' - return endpoint + return f'{handle.endpoint_ip}:{handle.load_balancer_port}' def _get_service_status( From fdd63325700467b776c3ed163a721861c6d8c8b1 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 10 Sep 2023 11:04:37 -0700 Subject: [PATCH 042/223] Update sky/cli.py Co-authored-by: Zongheng Yang --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 39109bb795b..ccb69a6f7d8 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4123,7 +4123,7 @@ def serve_up( click.echo() if not yes: - prompt = f'Launching a new service {service_name}. Proceed?' + prompt = f'Launching a new service {service_name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) From a93d9fa7e9c6011669f45b56148cbca6fd785e62 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 10 Sep 2023 11:29:23 -0700 Subject: [PATCH 043/223] Update sky/serve/examples/stable_diffusion_service.yaml Co-authored-by: Zongheng Yang --- sky/serve/examples/stable_diffusion_service.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/serve/examples/stable_diffusion_service.yaml b/sky/serve/examples/stable_diffusion_service.yaml index 4b4c86b3142..c96ee9c77ed 100644 --- a/sky/serve/examples/stable_diffusion_service.yaml +++ b/sky/serve/examples/stable_diffusion_service.yaml @@ -1,4 +1,7 @@ -#SkyPilot YAML to run stable diffusion web tool on 1 V100 GPU. +# SkyPilot YAML to run stable diffusion web tool on 1 V100 GPU. +# +# Usage: +# .. 
resources: accelerators: V100:1 From 87b2079f5f522ba849b27e9744cc517b47fec13d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 11:31:48 -0700 Subject: [PATCH 044/223] style for example --- sky/serve/examples/gorilla/run_gorilla.py | 24 +++++------ sky/serve/examples/http_server/server.py | 10 ++--- sky/serve/examples/llama2/chat.py | 42 +++++++++---------- .../misc/cancel/send_cancel_request.py | 6 +-- sky/serve/examples/misc/cancel/server.py | 8 ++-- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/sky/serve/examples/gorilla/run_gorilla.py b/sky/serve/examples/gorilla/run_gorilla.py index ce5199a5434..1d41106e5cf 100644 --- a/sky/serve/examples/gorilla/run_gorilla.py +++ b/sky/serve/examples/gorilla/run_gorilla.py @@ -5,39 +5,39 @@ import openai -openai.api_key = "EMPTY" # Key is ignored and does not matter +openai.api_key = 'EMPTY' # Key is ignored and does not matter # SkyServe endpoint -endpoint = input("Enter SkyServe endpoint: ") +endpoint = input('Enter SkyServe endpoint: ') # endpoint = '34.132.127.197:8000' -openai.api_base = f"http://{endpoint}/v1" +openai.api_base = f'http://{endpoint}/v1' # Report issues def raise_issue(e, model, prompt): - issue_title = urllib.parse.quote("[bug] Hosted Gorilla: ") + issue_title = urllib.parse.quote('[bug] Hosted Gorilla: ') issue_body = urllib.parse.quote( - f"Exception: {e}\nFailed model: {model}, for prompt: {prompt}") - issue_url = f"https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}" + f'Exception: {e}\nFailed model: {model}, for prompt: {prompt}') + issue_url = f'https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}' print( - f"An exception has occurred: {e} \nPlease raise an issue here: {issue_url}" + f'An exception has occurred: {e} \nPlease raise an issue here: 
{issue_url}' ) # Query Gorilla server -def get_gorilla_response(prompt, model="gorilla-mpt-7b-hf-v0"): +def get_gorilla_response(prompt, model='gorilla-mpt-7b-hf-v0'): try: completion = openai.ChatCompletion.create(model=model, messages=[{ - "role": "user", - "content": prompt + 'role': 'user', + 'content': prompt }]) return completion.choices[0].message.content except Exception as e: raise_issue(e, model, prompt) -prompt = "I would like to translate 'I feel very good today.' from English to Chinese." +prompt = 'I would like to translate "I feel very good today." from English to Chinese.' print(get_gorilla_response(prompt)) -prompt = "I want to build a robot that can detecting objects in an image ‘cat.jpeg’. Input: [‘cat.jpeg’]" +prompt = 'I want to build a robot that can detecting objects in an image ‘cat.jpeg’. Input: [‘cat.jpeg’]' print(get_gorilla_response(prompt)) diff --git a/sky/serve/examples/http_server/server.py b/sky/serve/examples/http_server/server.py index 967e8bd73d8..dacf2654b43 100644 --- a/sky/serve/examples/http_server/server.py +++ b/sky/serve/examples/http_server/server.py @@ -11,7 +11,7 @@ def do_GET(self): self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() - html = ''' + html = """ SkyPilot Test Page @@ -20,17 +20,17 @@ def do_GET(self):

Hi, SkyPilot here!

- ''' + """ self.wfile.write(bytes(html, 'utf8')) return -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') parser.add_argument('--port', type=int, required=False, default=8081) args = parser.parse_args() Handler = MyHttpRequestHandler - with socketserver.TCPServer(("", args.port), Handler) as httpd: - print("serving at port", args.port) + with socketserver.TCPServer(('', args.port), Handler) as httpd: + print('serving at port', args.port) httpd.serve_forever() diff --git a/sky/serve/examples/llama2/chat.py b/sky/serve/examples/llama2/chat.py index ba2f59658f5..ed4468ebf6d 100644 --- a/sky/serve/examples/llama2/chat.py +++ b/sky/serve/examples/llama2/chat.py @@ -4,40 +4,40 @@ import requests stream = True -model = "Llama-2-7b-chat-hf" -init_prompt = "You are a helpful assistant." -history = [{"role": "system", "content": init_prompt}] -endpoint = input("Endpoint: ") -url = f"http://{endpoint}/v1/chat/completions" -openai.api_base = f"http://{endpoint}/v1" -openai.api_key = "placeholder" +model = 'Llama-2-7b-chat-hf' +init_prompt = 'You are a helpful assistant.' 
+history = [{'role': 'system', 'content': init_prompt}] +endpoint = input('Endpoint: ') +url = f'http://{endpoint}/v1/chat/completions' +openai.api_base = f'http://{endpoint}/v1' +openai.api_key = 'placeholder' try: while True: - user_input = input("[User] ") - history.append({"role": "user", "content": user_input}) + user_input = input('[User] ') + history.append({'role': 'user', 'content': user_input}) if stream: resp = openai.ChatCompletion.create(model=model, messages=history, stream=True) - print("[Chatbot]", end="", flush=True) - tot = "" + print('[Chatbot]', end='', flush=True) + tot = '' for i in resp: - dlt = i["choices"][0]["delta"] - if "content" not in dlt: + dlt = i['choices'][0]['delta'] + if 'content' not in dlt: continue - print(dlt["content"], end="", flush=True) - tot += dlt["content"] + print(dlt['content'], end='', flush=True) + tot += dlt['content'] print() - history.append({"role": "assistant", "content": tot}) + history.append({'role': 'assistant', 'content': tot}) else: resp = requests.post(url, data=json.dumps({ - "model": model, - "messages": history + 'model': model, + 'messages': history })) - msg = resp.json()["choices"][0]["message"] - print("[Chatbot]" + msg["content"]) + msg = resp.json()['choices'][0]['message'] + print('[Chatbot]' + msg['content']) history.append(msg) except KeyboardInterrupt: - print("\nBye!") + print('\nBye!') diff --git a/sky/serve/examples/misc/cancel/send_cancel_request.py b/sky/serve/examples/misc/cancel/send_cancel_request.py index 76f41f43baf..b75cdaba16d 100644 --- a/sky/serve/examples/misc/cancel/send_cancel_request.py +++ b/sky/serve/examples/misc/cancel/send_cancel_request.py @@ -8,10 +8,10 @@ async def fetch(session, url): try: async with session.get(url) as response: - print("Got response!") + print('Got response!') return await response.text() except asyncio.CancelledError: - print("Request was cancelled!") + print('Request was cancelled!') raise @@ -30,7 +30,7 @@ async def main(): try: await task 
except asyncio.CancelledError: - print("Main function caught the cancelled exception.") + print('Main function caught the cancelled exception.') asyncio.run(main()) diff --git a/sky/serve/examples/misc/cancel/server.py b/sky/serve/examples/misc/cancel/server.py index dd3491f3a27..cf50e85ab49 100644 --- a/sky/serve/examples/misc/cancel/server.py +++ b/sky/serve/examples/misc/cancel/server.py @@ -11,20 +11,20 @@ async def handle(request): try: # Simulate a computation that takes 10 seconds for i in range(10): - print("Computing... step", i) + print('Computing... step', i) await asyncio.sleep(1) await response.write(b' ') # Sending a space as a heartbeat await response.write(b'Completed after 10 seconds.') except (asyncio.CancelledError, ConnectionResetError): - print("Client disconnected, stopping computation.") + print('Client disconnected, stopping computation.') return response return response async def health_check(request): - print("Received health check") - return web.Response(text="Healthy") + print('Received health check') + return web.Response(text='Healthy') if __name__ == '__main__': From 3dc4dacf7c600a310d1b781dd91cdb34402b3dac Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 11:32:15 -0700 Subject: [PATCH 045/223] format & add comments & rephrase --- sky/cli.py | 11 +++++------ sky/execution.py | 28 +++++++++++++++------------- sky/serve/constants.py | 23 +++++++++++++++++++++++ sky/serve/service_spec.py | 8 ++++---- 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ccb69a6f7d8..587968f3097 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4039,7 +4039,9 @@ def serve_up( prompt = (f'Service {service_name!r} has failed. ' 'Please clean up the service and try again.') else: - prompt = f'Service {service_name!r} already exists.' + prompt = (f'Service {service_name!r} already exists. ' + 'Updating a service will be supported in the future. 
' + 'For now, `sky serve down` first and try again.') click.secho(prompt, fg='red') return @@ -4326,16 +4328,13 @@ def _down_service(name: str): sky.serve_down(name, purge) except RuntimeError as e: message = ( - f'{colorama.Fore.RED}Tearing down service {name}...failed. ' + f'{colorama.Fore.RED}Tearing down service {name!r}...failed. ' 'Please manually clean up the replicas and then use --purge ' f'to clean up the controller.{colorama.Style.RESET_ALL}' f'\nReason: {common_utils.format_exception(e)}.') - except (exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError) as e: - message = str(e) else: message = ( - f'{colorama.Fore.GREEN}Tearing down service {name}...done.' + f'{colorama.Fore.GREEN}Tearing down service {name!r}...done.' f'{colorama.Style.RESET_ALL}') success_progress = True diff --git a/sky/execution.py b/sky/execution.py index e680c48198a..ee2ef2125e5 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1116,7 +1116,7 @@ def serve_up( fore = colorama.Fore style = colorama.Style - print(f'\n{fore.YELLOW}Launching controller for {service_name}...' + print(f'\n{fore.YELLOW}Launching controller for {service_name!r}...' f'{style.RESET_ALL}') job_id = _execute( entrypoint=controller_task, @@ -1172,7 +1172,7 @@ def _wait_until_job_is_running_on_controller( service_handle.job_id = job_id global_user_state.set_service_handle(service_name, service_handle) - print(f'{fore.GREEN}Launching controller for {service_name}...done.' + print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' 
f'{style.RESET_ALL}') global_user_state.set_service_status( @@ -1200,7 +1200,7 @@ def _wait_until_job_is_running_on_controller( print(f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' f'{style.RESET_ALL}{fore.CYAN}' f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') - print(f'{fore.GREEN}Starting replica now...{style.RESET_ALL}') + print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') print('Please use the above command to find the latest status.') @@ -1235,7 +1235,8 @@ def serve_down( if service_handle.controller_port is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'Controller job of service {service_name} not found.') + f'Controller job of service {service_name!r} not found.' + ) code = serve.ServeCodeGen.terminate_service( service_handle.controller_port) @@ -1248,7 +1249,7 @@ def serve_down( subprocess_utils.handle_returncode( returncode, code, ('Failed when submit terminate request to controller ' - f'of service {service_name}'), + f'of service {service_name!r}'), stderr, stream_logs=False) @@ -1256,13 +1257,13 @@ def serve_down( terminate_service_payload) if resp.status_code != 200: raise RuntimeError('Failed to terminate replica of service ' - f'{service_name} due to request ' + f'{service_name!r} due to request ' f'failure: {resp.text}') msg = resp.json()['message'] if msg: raise RuntimeError( 'Unexpected message when tearing down replica of service ' - f'{service_name}: {msg}. Please login to the controller ' + f'{service_name!r}: {msg}. 
Please login to the controller ' 'and make sure the service is properly cleaned.') # We want to make sure no matter what error happens, we can still @@ -1270,14 +1271,14 @@ def serve_down( except Exception as e: # pylint: disable=broad-except if purge: logger.warning('Ignoring error when cleaning replicas of ' - f'{service_name}: {e}') + f'{service_name!r}: {e}') else: raise RuntimeError(e) from e else: if not purge: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'Cannot find controller of service {service_name}.') + f'Cannot find controller of service {service_name!r}.') try: if handle is not None: @@ -1303,16 +1304,17 @@ def serve_down( separate_stderr=True) subprocess_utils.handle_returncode( returncode, - code, ('Failed when cleaning up utility files on controller ' - f'of service {service_name}'), + code, ('Failed when cleaning up service files on controller ' + f'of service {service_name!r}'), stderr, stream_logs=False) # same as above. except Exception as e: # pylint: disable=broad-except if purge: - logger.warning('Ignoring error when stopping controller and ' - f'load balancer jobs of service {service_name}: {e}') + logger.warning( + 'Ignoring error when stopping controller and ' + f'load balancer jobs of service {service_name!r}: {e}') else: raise RuntimeError(e) from e diff --git a/sky/serve/constants.py b/sky/serve/constants.py index d00ea4d2689..6ae672c94e7 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -2,6 +2,9 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 +# A prefix for all controller clusters. We use this prefix to identify a +# skyserve controller cluster. We will append a user hash and an incremental +# id to this prefix to generate a unique controller cluster name every time. CONTROLLER_PREFIX = 'sky-serve-controller-' CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' @@ -15,14 +18,29 @@ # In both cases, service name shares the same regex with cluster name. 
SERVICE_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' +# The filelock for selecting controller and service ports when starting a +# service. In our current multi-service controller implementation, we need to: +# 1. Select a controller if there are some existing controllers; +# 2. Select ports for each service atomically to avoid port conflicts. +# All of them are protected by this file lock from race conditions. CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' CONTROLLER_FILE_LOCK_TIMEOUT = 20 +# The time interval for load balancer to sync with controller. Every time the +# load balancer syncs with controller, it will update all available replica ips +# for each service, also send the number of requests in last query interval. CONTROLLER_SYNC_INTERVAL = 20 + +# The default timeout for a readiness probe request. We set the timeout to 15s +# since using actual generation in LLM services as readiness probe is very +# time-consuming (33B, 70B, ...). +# TODO(tian): Expose this option to users in yaml file. READINESS_PROBE_TIMEOUT = 15 +# The time to wait for a service to start up when we start a service. SERVE_STARTUP_TIMEOUT = 60 +# The default controller resources. # We need 200 GB disk space to enable using Azure as controller, since its image # size is 150 GB. Also, we need 32 GB memory to run our controller and load # balancer jobs since it is very memory demanding. @@ -45,6 +63,11 @@ DEFAULT_INITIAL_DELAY_SECONDS = 1200 DEFAULT_MIN_REPLICAS = 1 +# Default port range start for controller and load balancer. Ports will be +# automatically generated from this start port. CONTROLLER_PORT_START = 20001 LOAD_BALANCER_PORT_START = 30001 + +# Ports to open for controller VM. We open ~1000 ports for controller to ensure +# services can be started on the same controller without port conflicts. 
LOAD_BALANCER_PORT_RANGE = '30001-31000' diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 2d694ef36fb..128ac1e34ab 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -193,10 +193,10 @@ def policy_str(self): def __repr__(self) -> str: return textwrap.dedent(f"""\ - Readiness probe method: {self.probe_str()} - Replica autoscaling policy: {self.policy_str()} - Service initial delay seconds: {self.initial_delay_seconds} - Replica auto restart: {self.auto_restart} + Readiness probe method: {self.probe_str()} + Readiness initial delay seconds: {self.initial_delay_seconds} + Replica autoscaling policy: {self.policy_str()} + Replica auto restart: {self.auto_restart} Please refer to SkyPilot Serve document for detailed explanations. """) From 9850d40a9b10fe199cd90c78002f3d85eb8af0c5 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 13:16:03 -0700 Subject: [PATCH 046/223] move serve_down to core and refactor --- sky/__init__.py | 2 +- sky/cli.py | 2 +- sky/core.py | 259 +++++++++++++++++++++++++++++++++++------------ sky/execution.py | 125 ----------------------- 4 files changed, 196 insertions(+), 192 deletions(-) diff --git a/sky/__init__.py b/sky/__init__.py index 9e3a42dcbdc..15a0f69128d 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -18,6 +18,7 @@ from sky.core import download_logs from sky.core import job_status from sky.core import queue +from sky.core import serve_down from sky.core import spot_cancel from sky.core import spot_queue from sky.core import spot_status @@ -33,7 +34,6 @@ from sky.data import StoreType from sky.execution import exec # pylint: disable=redefined-builtin from sky.execution import launch -from sky.execution import serve_down from sky.execution import serve_up from sky.execution import spot_launch from sky.optimizer import Optimizer diff --git a/sky/cli.py b/sky/cli.py index 380efd905f2..55831d612db 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4227,7 +4227,7 @@ def 
serve_status(all: bool, service_name: Optional[str]): # Only show status of my-service sky serve status my-service """ - service_records = core.service_status(service_name) + service_records = core.serve_status(service_name) if service_name is not None and not service_records: click.secho(f'Service {service_name!r} not found.', fg='red') return diff --git a/sky/core.py b/sky/core.py index 0f2b2cbf32b..4682cbe0e47 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1,7 +1,7 @@ """SDK functions for cluster/job management.""" import getpass +import os import sys -import typing from typing import Any, Dict, List, Optional, Union import colorama @@ -12,6 +12,7 @@ from sky import data from sky import exceptions from sky import global_user_state +from sky import serve from sky import sky_logging from sky import spot from sky import status_lib @@ -25,9 +26,6 @@ from sky.utils import tpu_utils from sky.utils import ux_utils -if typing.TYPE_CHECKING: - from sky import serve - logger = sky_logging.init_logger(__name__) # ====================== @@ -113,67 +111,6 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, cluster_names=cluster_names) -@usage_lib.entrypoint -def service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: - return backend_utils.refresh_service_status(service_name) - - -@usage_lib.entrypoint -def serve_tail_logs(service_name: str, - controller: bool = False, - load_balancer: bool = False, - replica_id: Optional[int] = None, - follow: bool = True) -> None: - """Tail logs for a service. - - Usage: - core.serve_tail_logs(service_name, =, follow=True/False) - - One and only one of must be specified: controller, load_balancer, - or replica_id. 
- - To tail controller logs: - # follow default to True - core.serve_tail_logs(service_name, controller=True) - - To print replica 3 logs: - core.serve_tail_logs(service_name, replica_id=3, follow=False) - """ - have_replica_id = replica_id is not None - if (controller + load_balancer + have_replica_id) != 1: - with ux_utils.print_exception_no_traceback(): - raise ValueError('One and only one of controller, load_balancer, ' - 'or replica_id must be specified.') - service_record = global_user_state.get_service_from_name(service_name) - if service_record is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r} does not exist. ' - 'Cannot stream logs.') - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Service {service_name!r} is still initializing its ' - 'controller. Please try again later.') - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r}\'s controller failed. ' - 'Cannot tail logs.') - service_handle: 'serve.ServiceHandle' = service_record['handle'] - controller_name = service_record['controller_name'] - handle = global_user_state.get_handle_from_cluster_name(controller_name) - if handle is None: - raise ValueError(f'Cannot find controller for service {service_name}.') - assert isinstance(handle, backends.CloudVmRayResourceHandle), handle - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend), backend - backend.tail_serve_logs(handle, - service_handle, - controller, - load_balancer, - replica_id, - follow=follow) - - @usage_lib.entrypoint def cost_report() -> List[Dict[str, Any]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
@@ -1044,3 +981,195 @@ def storage_delete(name: str) -> None: source=handle.source, sync_on_reconstruction=False) store_object.delete() + + +# ====================== +# = Service Management = +# ====================== + + +@usage_lib.entrypoint +def serve_status(service_name: Optional[str]) -> List[Dict[str, Any]]: + return backend_utils.refresh_service_status(service_name) + + +@usage_lib.entrypoint +def serve_tail_logs(service_name: str, + controller: bool = False, + load_balancer: bool = False, + replica_id: Optional[int] = None, + follow: bool = True) -> None: + """Tail logs for a service. + + Usage: + core.serve_tail_logs(service_name, =, follow=True/False) + + One and only one of must be specified: controller, load_balancer, + or replica_id. + + To tail controller logs: + # follow default to True + core.serve_tail_logs(service_name, controller=True) + + To print replica 3 logs: + core.serve_tail_logs(service_name, replica_id=3, follow=False) + """ + have_replica_id = replica_id is not None + if (controller + load_balancer + have_replica_id) != 1: + with ux_utils.print_exception_no_traceback(): + raise ValueError('One and only one of controller, load_balancer, ' + 'or replica_id must be specified.') + service_record = global_user_state.get_service_from_name(service_name) + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r} does not exist. ' + 'Cannot stream logs.') + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Service {service_name!r} is still initializing its ' + 'controller. Please try again later.') + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r}\'s controller failed. 
' + 'Cannot tail logs.') + service_handle: serve.ServiceHandle = service_record['handle'] + controller_name = service_record['controller_name'] + handle = global_user_state.get_handle_from_cluster_name(controller_name) + if handle is None: + raise ValueError(f'Cannot find controller for service {service_name}.') + assert isinstance(handle, backends.CloudVmRayResourceHandle), handle + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend), backend + backend.tail_serve_logs(handle, + service_handle, + controller, + load_balancer, + replica_id, + follow=follow) + + +@usage_lib.entrypoint +def serve_down( + service_name: str, + purge: bool, +): + """Teardown a service. + + Please refer to the sky.cli.serve_down for the document. + + Args: + service_name: Name of the service. + purge: If true, ignore errors when cleaning up the controller. + """ + service_record = global_user_state.get_service_from_name(service_name) + + if service_record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r} not found.') + + service_handle: serve.ServiceHandle = service_record['handle'] + controller_name = service_record['controller_name'] + global_user_state.set_service_status(service_name, + status_lib.ServiceStatus.SHUTTING_DOWN) + handle = global_user_state.get_handle_from_cluster_name(controller_name) + + if handle is not None: + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + try: + if service_handle.controller_port is None: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Controller job of service {service_name!r} not found.' 
+ ) + + code = serve.ServeCodeGen.terminate_service( + service_handle.controller_port) + returncode, terminate_service_payload, stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + subprocess_utils.handle_returncode( + returncode, + code, ('Failed when submit terminate request to controller ' + f'of service {service_name!r}'), + stderr, + stream_logs=False) + + resp = serve.load_terminate_service_result( + terminate_service_payload) + if resp.status_code != 200: + raise RuntimeError('Failed to terminate replica of service ' + f'{service_name!r} due to request ' + f'failure: {resp.text}') + msg = resp.json()['message'] + if msg: + raise RuntimeError( + 'Unexpected message when tearing down replica of service ' + f'{service_name!r}: {msg}. Please login to the controller ' + 'and make sure the service is properly cleaned.') + + # We want to make sure no matter what error happens, we can still + # clean up the record if purge is True. + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning('Ignoring error when cleaning replicas of ' + f'{service_name!r}: {e}') + else: + raise RuntimeError(e) from e + else: + if not purge: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Cannot find controller of service {service_name!r}.') + + try: + if handle is not None: + assert isinstance(handle, backends.CloudVmRayResourceHandle) + backend = backends.CloudVmRayBackend() + + # Cancel the controller and load balancer jobs. + # For the case when controller / load_balancer job failed to submit. + jobs = [] + if service_handle.job_id is not None: + jobs.append(service_handle.job_id) + backend.cancel_jobs(handle, jobs=jobs, silent=True) + + # Cleanup all files on controller related to this service. 
+ # We have a 10-min grace period for the controller to autostop, + # so it should be fine if this is the last service on the + # controller and its job is the only one running. + code = serve.ServeCodeGen.cleanup_service_files(service_name) + returncode, _, stderr = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + subprocess_utils.handle_returncode( + returncode, + code, ('Failed when cleaning up service files on controller ' + f'of service {service_name!r}'), + stderr, + stream_logs=False) + + # same as above. + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning( + 'Ignoring error when stopping controller and ' + f'load balancer jobs of service {service_name!r}: {e}') + else: + raise RuntimeError(e) from e + + # TODO(tian): Maybe add a post_cleanup function? + controller_yaml_path = serve.generate_controller_yaml_file_name( + service_name) + if os.path.exists(controller_yaml_path): + os.remove(controller_yaml_path) + handle = global_user_state.get_handle_from_service_name(service_name) + assert handle is not None + handle.cleanup_ephemeral_storage() + global_user_state.remove_service(service_name) diff --git a/sky/execution.py b/sky/execution.py index 4cfa3f709e6..a503db449a2 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1193,128 +1193,3 @@ def _wait_until_job_is_running_on_controller( f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') print('Please use the above command to find the latest status.') - - -def serve_down( - service_name: str, - purge: bool, -): - """Teardown a service. - - Please refer to the sky.cli.serve_down for the document. - - Args: - service_name: Name of the service. - purge: If true, ignore errors when cleaning up the controller. 
- """ - service_record = global_user_state.get_service_from_name(service_name) - - if service_record is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r} not found.') - - service_handle: serve.ServiceHandle = service_record['handle'] - controller_name = service_record['controller_name'] - global_user_state.set_service_status(service_name, - status_lib.ServiceStatus.SHUTTING_DOWN) - handle = global_user_state.get_handle_from_cluster_name(controller_name) - - if handle is not None: - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - try: - if service_handle.controller_port is None: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Controller job of service {service_name!r} not found.' - ) - - code = serve.ServeCodeGen.terminate_service( - service_handle.controller_port) - returncode, terminate_service_payload, stderr = backend.run_on_head( - handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - subprocess_utils.handle_returncode( - returncode, - code, ('Failed when submit terminate request to controller ' - f'of service {service_name!r}'), - stderr, - stream_logs=False) - - resp = serve.load_terminate_service_result( - terminate_service_payload) - if resp.status_code != 200: - raise RuntimeError('Failed to terminate replica of service ' - f'{service_name!r} due to request ' - f'failure: {resp.text}') - msg = resp.json()['message'] - if msg: - raise RuntimeError( - 'Unexpected message when tearing down replica of service ' - f'{service_name!r}: {msg}. Please login to the controller ' - 'and make sure the service is properly cleaned.') - - # We want to make sure no matter what error happens, we can still - # clean up the record if purge is True. 
- except Exception as e: # pylint: disable=broad-except - if purge: - logger.warning('Ignoring error when cleaning replicas of ' - f'{service_name!r}: {e}') - else: - raise RuntimeError(e) from e - else: - if not purge: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Cannot find controller of service {service_name!r}.') - - try: - if handle is not None: - assert isinstance(handle, backends.CloudVmRayResourceHandle) - backend = backends.CloudVmRayBackend() - - # Cancel the controller and load balancer jobs. - # For the case when controller / load_balancer job failed to submit. - jobs = [] - if service_handle.job_id is not None: - jobs.append(service_handle.job_id) - backend.cancel_jobs(handle, jobs=jobs, silent=True) - - # Cleanup all files on controller related to this service. - # We have a 10-min grace period for the controller to autostop, - # so it should be fine if this is the last service on the - # controller and its job is the only one running. - code = serve.ServeCodeGen.cleanup_service_files(service_name) - returncode, _, stderr = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - subprocess_utils.handle_returncode( - returncode, - code, ('Failed when cleaning up service files on controller ' - f'of service {service_name!r}'), - stderr, - stream_logs=False) - - # same as above. - except Exception as e: # pylint: disable=broad-except - if purge: - logger.warning( - 'Ignoring error when stopping controller and ' - f'load balancer jobs of service {service_name!r}: {e}') - else: - raise RuntimeError(e) from e - - # TODO(tian): Maybe add a post_cleanup function? 
- controller_yaml_path = serve.generate_controller_yaml_file_name( - service_name) - if os.path.exists(controller_yaml_path): - os.remove(controller_yaml_path) - handle = global_user_state.get_handle_from_service_name(service_name) - assert handle is not None - handle.cleanup_ephemeral_storage() - global_user_state.remove_service(service_name) From 2fb615d4c0f18291e23320a0f8de2c37effc2e44 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 10 Sep 2023 13:25:47 -0700 Subject: [PATCH 047/223] Update sky/execution.py Co-authored-by: Zhanghao Wu --- sky/execution.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index a503db449a2..ead790c808f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1067,8 +1067,7 @@ def serve_up( with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() - if 'resources' in task_config and 'spot_recovery' in task_config[ - 'resources']: + if ('resources' in task_config and 'spot_recovery' in task_config['resources']): del task_config['resources']['spot_recovery'] common_utils.dump_yaml(f.name, task_config) remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( From aa4cda061e494914793e985e48415e8268993d9f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 13:49:03 -0700 Subject: [PATCH 048/223] apply suggestions from code review --- sky/backends/backend_utils.py | 17 ++--------------- sky/cli.py | 29 ++++++++++++++--------------- sky/execution.py | 10 +++++++--- sky/global_user_state.py | 6 ++---- sky/serve/__init__.py | 1 + sky/serve/serve_utils.py | 13 +++++++++++++ 6 files changed, 39 insertions(+), 37 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 2202b3a558e..bca4b7ac428 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,4 @@ """Util constants/functions for the backends.""" -import collections import copy 
from datetime import datetime import difflib @@ -1483,7 +1482,7 @@ def generate_cluster_name(): def generate_service_name(): - return f'service-{uuid.uuid4().hex[:4]}' + return f'sky-service-{uuid.uuid4().hex[:4]}' def get_cleaned_username(username: str = '') -> str: @@ -2645,18 +2644,6 @@ def _refresh_cluster(cluster_name): return kept_records -def _service_status_from_replica_info( - replica_info: List[Dict[str, Any]]) -> status_lib.ServiceStatus: - status2num = collections.Counter([i['status'] for i in replica_info]) - # If one replica is READY, the service is READY. - if status2num[status_lib.ReplicaStatus.READY] > 0: - return status_lib.ServiceStatus.READY - if sum(status2num[status] - for status in status_lib.ReplicaStatus.failed_statuses()) > 0: - return status_lib.ServiceStatus.FAILED - return status_lib.ServiceStatus.REPLICA_INIT - - def _refresh_service_record_no_lock( service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: """Refresh the service, and return the possibly updated record. @@ -2731,7 +2718,7 @@ def _refresh_service_record_no_lock( status_lib.ServiceStatus.SHUTTING_DOWN, status_lib.ServiceStatus.CONTROLLER_INIT, ]: - local_record['status'] = _service_status_from_replica_info( + local_record['status'] = serve_lib.replica_info_to_service_status( latest_info['replica_info']) global_user_state.add_or_update_service(**local_record) diff --git a/sky/cli.py b/sky/cli.py index 55831d612db..d6abf62ae84 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4049,16 +4049,15 @@ def serve_up( prompt = (f'Service {service_name!r} already exists. ' 'Updating a service will be supported in the future. 
' 'For now, `sky serve down` first and try again.') - click.secho(prompt, fg='red') - return + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(prompt) shell_splits = shlex.split(entrypoint) yaml_file_provided = (len(shell_splits) == 1 and (shell_splits[0].endswith('yaml') or shell_splits[0].endswith('.yml'))) if not yaml_file_provided: - click.secho('ENTRYPOINT must points to a valid YAML file.', fg='red') - return + raise click.UsageError('ENTRYPOINT must points to a valid YAML file.') is_yaml = True config: Optional[List[Dict[str, Any]]] = None @@ -4096,22 +4095,21 @@ def serve_up( ' path is correct.') is_yaml = False if not is_yaml: - click.secho( - f'{entrypoint!r} looks like a yaml path but {invalid_reason}', - fg='red') - return + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'{entrypoint!r} looks like a yaml path but {invalid_reason}') click.secho('Service from YAML spec: ', fg='yellow', nl=False) click.secho(entrypoint, bold=True) usage_lib.messages.usage.update_user_task_yaml(entrypoint) dag = dag_utils.load_chain_dag_from_yaml(entrypoint) if len(dag.tasks) > 1: - click.secho('Multiple tasks found in the YAML file.', fg='red') - return + with ux_utils.print_exception_no_traceback(): + raise ValueError('Multiple tasks found in the YAML file.') task: sky.Task = dag.tasks[0] if task.service is None: - click.secho('Service section not found in the YAML file.', fg='red') - return + with ux_utils.print_exception_no_traceback(): + raise ValueError('Service section not found in the YAML file.') assert len(task.resources) == 1 requested_resources = list(task.resources)[0] if requested_resources.ports is not None: @@ -4125,7 +4123,8 @@ def serve_up( click.secho('Service Spec:', fg='cyan') click.echo(task.service) - click.secho('Each replica will use the following resource:', fg='cyan') + click.secho('Each replica will use the following resource (estimated):', + fg='cyan') with sky.Dag() as dag: dag.add(task) 
sky.optimize(dag) @@ -4229,8 +4228,8 @@ def serve_status(all: bool, service_name: Optional[str]): """ service_records = core.serve_status(service_name) if service_name is not None and not service_records: - click.secho(f'Service {service_name!r} not found.', fg='red') - return + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name!r} not found.') click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') status_utils.show_service_table(service_records, all) diff --git a/sky/execution.py b/sky/execution.py index ead790c808f..abba9b31852 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1038,8 +1038,11 @@ def serve_up( controller_name, _ = serve.get_available_controller_name( controller_resources) global_user_state.add_or_update_service( - service_name, None, controller_name, service_handle, - status_lib.ServiceStatus.CONTROLLER_INIT) + service_name, + launched_at=int(time.time()), + controller_name=controller_name, + handle=service_handle, + status=status_lib.ServiceStatus.CONTROLLER_INIT) controller_port, load_balancer_port = ( serve.gen_ports_for_serve_process(controller_name)) @@ -1067,7 +1070,8 @@ def serve_up( with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() - if ('resources' in task_config and 'spot_recovery' in task_config['resources']): + if ('resources' in task_config and + 'spot_recovery' in task_config['resources']): del task_config['resources']['spot_recovery'] common_utils.dump_yaml(f.name, task_config) remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 6cf719d8e69..746e8f5d26f 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -281,11 +281,9 @@ def add_or_update_cluster(cluster_name: str, _DB.conn.commit() -def add_or_update_service(name: str, launched_at: Optional[int], - controller_name: str, 
handle: 'serve.ServiceHandle', +def add_or_update_service(name: str, launched_at: int, controller_name: str, + handle: 'serve.ServiceHandle', status: status_lib.ServiceStatus) -> None: - if launched_at is None: - launched_at = int(time.time()) _DB.cursor.execute( 'INSERT or REPLACE INTO services' '(name, launched_at, controller_name, handle, status) ' diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 7a581ac4c50..ffabbf4d947 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -23,6 +23,7 @@ from sky.serve.serve_utils import get_available_controller_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result +from sky.serve.serve_utils import replica_info_to_service_status from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e580029a12a..8f05b81c364 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -1,5 +1,6 @@ """User interface with the SkyServe.""" import base64 +import collections import os import pickle import re @@ -279,6 +280,18 @@ def get_available_controller_name( key=lambda k: available_controller_to_service_num[k]), False +def replica_info_to_service_status( + replica_info: List[Dict[str, Any]]) -> status_lib.ServiceStatus: + status2num = collections.Counter([i['status'] for i in replica_info]) + # If one replica is READY, the service is READY. 
+ if status2num[status_lib.ReplicaStatus.READY] > 0: + return status_lib.ServiceStatus.READY + if sum(status2num[status] + for status in status_lib.ReplicaStatus.failed_statuses()) > 0: + return status_lib.ServiceStatus.FAILED + return status_lib.ServiceStatus.REPLICA_INIT + + class ServiceHandle(object): """A pickle-able tuple of: From 3db51ca377006b6e1d0dc8edff8464751460d48b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 13:52:57 -0700 Subject: [PATCH 049/223] move examples --- {sky/serve/examples => examples/serve}/gorilla/gorilla.yaml | 0 {sky/serve/examples => examples/serve}/gorilla/run_gorilla.py | 0 .../serve}/hello_skyserve/hello_skyserve.yaml | 0 .../examples => examples/serve}/hello_skyserve/index.html | 0 {sky/serve/examples => examples/serve}/http_server/server.py | 0 {sky/serve/examples => examples/serve}/http_server/task.yaml | 2 +- {sky/serve/examples => examples/serve}/llama2/chat.py | 0 {sky/serve/examples => examples/serve}/llama2/llama2.yaml | 0 {sky/serve/examples => examples/serve}/misc/cancel/README.md | 4 ++-- .../serve}/misc/cancel/send_cancel_request.py | 0 {sky/serve/examples => examples/serve}/misc/cancel/server.py | 0 .../examples => examples/serve}/misc/cancel/service.yaml | 2 +- .../examples => examples/serve}/stable_diffusion_service.yaml | 0 {sky/serve/examples => examples/serve}/tgi_coder.yaml | 0 {sky/serve/examples => examples/serve}/vicuna-v1.5.yaml | 0 {sky/serve/examples => examples/serve}/vllm.yaml | 0 tests/skyserve/auto_restart.yaml | 2 +- tests/skyserve/http/aws.yaml | 2 +- tests/skyserve/http/azure.yaml | 2 +- tests/skyserve/http/gcp.yaml | 2 +- tests/skyserve/http/mixed_cloud.yaml | 2 +- 21 files changed, 9 insertions(+), 9 deletions(-) rename {sky/serve/examples => examples/serve}/gorilla/gorilla.yaml (100%) rename {sky/serve/examples => examples/serve}/gorilla/run_gorilla.py (100%) rename {sky/serve/examples => examples/serve}/hello_skyserve/hello_skyserve.yaml (100%) rename {sky/serve/examples => 
examples/serve}/hello_skyserve/index.html (100%) rename {sky/serve/examples => examples/serve}/http_server/server.py (100%) rename {sky/serve/examples => examples/serve}/http_server/task.yaml (79%) rename {sky/serve/examples => examples/serve}/llama2/chat.py (100%) rename {sky/serve/examples => examples/serve}/llama2/llama2.yaml (100%) rename {sky/serve/examples => examples/serve}/misc/cancel/README.md (82%) rename {sky/serve/examples => examples/serve}/misc/cancel/send_cancel_request.py (100%) rename {sky/serve/examples => examples/serve}/misc/cancel/server.py (100%) rename {sky/serve/examples => examples/serve}/misc/cancel/service.yaml (82%) rename {sky/serve/examples => examples/serve}/stable_diffusion_service.yaml (100%) rename {sky/serve/examples => examples/serve}/tgi_coder.yaml (100%) rename {sky/serve/examples => examples/serve}/vicuna-v1.5.yaml (100%) rename {sky/serve/examples => examples/serve}/vllm.yaml (100%) diff --git a/sky/serve/examples/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml similarity index 100% rename from sky/serve/examples/gorilla/gorilla.yaml rename to examples/serve/gorilla/gorilla.yaml diff --git a/sky/serve/examples/gorilla/run_gorilla.py b/examples/serve/gorilla/run_gorilla.py similarity index 100% rename from sky/serve/examples/gorilla/run_gorilla.py rename to examples/serve/gorilla/run_gorilla.py diff --git a/sky/serve/examples/hello_skyserve/hello_skyserve.yaml b/examples/serve/hello_skyserve/hello_skyserve.yaml similarity index 100% rename from sky/serve/examples/hello_skyserve/hello_skyserve.yaml rename to examples/serve/hello_skyserve/hello_skyserve.yaml diff --git a/sky/serve/examples/hello_skyserve/index.html b/examples/serve/hello_skyserve/index.html similarity index 100% rename from sky/serve/examples/hello_skyserve/index.html rename to examples/serve/hello_skyserve/index.html diff --git a/sky/serve/examples/http_server/server.py b/examples/serve/http_server/server.py similarity index 100% rename from 
sky/serve/examples/http_server/server.py rename to examples/serve/http_server/server.py diff --git a/sky/serve/examples/http_server/task.yaml b/examples/serve/http_server/task.yaml similarity index 79% rename from sky/serve/examples/http_server/task.yaml rename to examples/serve/http_server/task.yaml index 965e7a5c39f..7faeec72eaa 100644 --- a/sky/serve/examples/http_server/task.yaml +++ b/examples/serve/http_server/task.yaml @@ -1,7 +1,7 @@ resources: cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server run: python3 server.py diff --git a/sky/serve/examples/llama2/chat.py b/examples/serve/llama2/chat.py similarity index 100% rename from sky/serve/examples/llama2/chat.py rename to examples/serve/llama2/chat.py diff --git a/sky/serve/examples/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml similarity index 100% rename from sky/serve/examples/llama2/llama2.yaml rename to examples/serve/llama2/llama2.yaml diff --git a/sky/serve/examples/misc/cancel/README.md b/examples/serve/misc/cancel/README.md similarity index 82% rename from sky/serve/examples/misc/cancel/README.md rename to examples/serve/misc/cancel/README.md index cc76b6b148c..65b88c2d540 100644 --- a/sky/serve/examples/misc/cancel/README.md +++ b/examples/serve/misc/cancel/README.md @@ -7,7 +7,7 @@ This example demonstrates the redirect support canceling a request. Under skypilot root directory, run the following command: ```bash -sky serve up sky/serve/examples/misc/cancel/service.yaml -n skyserve-cancel-test +sky serve up examples/serve/misc/cancel/service.yaml -n skyserve-cancel-test ``` Use `sky serve status` to monitor the status of the service. When its ready, run @@ -19,7 +19,7 @@ sky serve logs skyserve-cancel-test 1 to monitor the logs of the service. Run ```bash -python3 sky/serve/examples/misc/cancel/send_cancel_request.py +python3 examples/serve/misc/cancel/send_cancel_request.py ``` and enter the endpoint output by `sky serve status`. 
You should see the following output: diff --git a/sky/serve/examples/misc/cancel/send_cancel_request.py b/examples/serve/misc/cancel/send_cancel_request.py similarity index 100% rename from sky/serve/examples/misc/cancel/send_cancel_request.py rename to examples/serve/misc/cancel/send_cancel_request.py diff --git a/sky/serve/examples/misc/cancel/server.py b/examples/serve/misc/cancel/server.py similarity index 100% rename from sky/serve/examples/misc/cancel/server.py rename to examples/serve/misc/cancel/server.py diff --git a/sky/serve/examples/misc/cancel/service.yaml b/examples/serve/misc/cancel/service.yaml similarity index 82% rename from sky/serve/examples/misc/cancel/service.yaml rename to examples/serve/misc/cancel/service.yaml index 0df1640f290..ab736649f8c 100644 --- a/sky/serve/examples/misc/cancel/service.yaml +++ b/examples/serve/misc/cancel/service.yaml @@ -9,6 +9,6 @@ service: resources: cpus: 2+ -workdir: sky/serve/examples/misc/cancel +workdir: examples/serve/misc/cancel run: python3 server.py --port 9000 diff --git a/sky/serve/examples/stable_diffusion_service.yaml b/examples/serve/stable_diffusion_service.yaml similarity index 100% rename from sky/serve/examples/stable_diffusion_service.yaml rename to examples/serve/stable_diffusion_service.yaml diff --git a/sky/serve/examples/tgi_coder.yaml b/examples/serve/tgi_coder.yaml similarity index 100% rename from sky/serve/examples/tgi_coder.yaml rename to examples/serve/tgi_coder.yaml diff --git a/sky/serve/examples/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml similarity index 100% rename from sky/serve/examples/vicuna-v1.5.yaml rename to examples/serve/vicuna-v1.5.yaml diff --git a/sky/serve/examples/vllm.yaml b/examples/serve/vllm.yaml similarity index 100% rename from sky/serve/examples/vllm.yaml rename to examples/serve/vllm.yaml diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index ba5affe4d2a..baa9495b4bd 100644 --- a/tests/skyserve/auto_restart.yaml +++ 
b/tests/skyserve/auto_restart.yaml @@ -3,7 +3,7 @@ resources: zone: us-central1-a cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server run: python3 server.py --port 8080 diff --git a/tests/skyserve/http/aws.yaml b/tests/skyserve/http/aws.yaml index 512bd5dc007..4482e28aeca 100644 --- a/tests/skyserve/http/aws.yaml +++ b/tests/skyserve/http/aws.yaml @@ -2,7 +2,7 @@ resources: cloud: aws cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server run: python3 server.py diff --git a/tests/skyserve/http/azure.yaml b/tests/skyserve/http/azure.yaml index 011e57deba5..2b0faf26b7a 100644 --- a/tests/skyserve/http/azure.yaml +++ b/tests/skyserve/http/azure.yaml @@ -2,7 +2,7 @@ resources: cloud: azure cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server # Use 8081 to test jupyterhub service is terminated run: python3 server.py --port 8081 diff --git a/tests/skyserve/http/gcp.yaml b/tests/skyserve/http/gcp.yaml index 08ce7899631..60ad9d26362 100644 --- a/tests/skyserve/http/gcp.yaml +++ b/tests/skyserve/http/gcp.yaml @@ -2,7 +2,7 @@ resources: cloud: gcp cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server # Use 8080 to test jupyter service is terminated run: python3 server.py --port 8080 diff --git a/tests/skyserve/http/mixed_cloud.yaml b/tests/skyserve/http/mixed_cloud.yaml index 81f65ad869a..a4b3bd16bf0 100644 --- a/tests/skyserve/http/mixed_cloud.yaml +++ b/tests/skyserve/http/mixed_cloud.yaml @@ -2,7 +2,7 @@ resources: cloud: gcp cpus: 2+ -workdir: sky/serve/examples/http_server +workdir: examples/serve/http_server run: python3 server.py From 788c497252a28727808000de0255ea8a41d08b45 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 10 Sep 2023 13:54:56 -0700 Subject: [PATCH 050/223] Update sky/cli.py Co-authored-by: Zhanghao Wu --- sky/cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 
d6abf62ae84..402b1d6cf0c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1753,10 +1753,8 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): is_skyserve_controller = False for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: if cluster_name.startswith(prefix): - is_skyserve_controller = True + skyserve_controllers.append(cluster_record) break - if is_skyserve_controller: - skyserve_controllers.append(cluster_record) else: nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( From 9bb4334ff6c8f5fa3177875a01b0b6839032474b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 14:26:37 -0700 Subject: [PATCH 051/223] use _make_task_or_dag_from_entrypoint_with_overrides & minor --- sky/cli.py | 83 +++++++++++---------------------------- sky/serve/service_spec.py | 11 +++--- 2 files changed, 28 insertions(+), 66 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 402b1d6cf0c..3b3fa5d8dec 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1037,6 +1037,9 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: List[str], *, + yaml_only: bool = False, + task_only: bool = False, + entrypoint_name: str = 'Task', name: Optional[str] = None, cluster: Optional[str] = None, workdir: Optional[str] = None, @@ -1067,14 +1070,21 @@ def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Optional[str] if is_yaml: # Treat entrypoint as a yaml. - click.secho('Task from YAML spec: ', fg='yellow', nl=False) + click.secho(f'{entrypoint_name} from YAML spec: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) else: + if yaml_only: + raise click.UsageError( + f'Expected a yaml file, but got {entrypoint}.') if not entrypoint: entrypoint = None else: # Treat entrypoint as a bash command. 
- click.secho('Task from command: ', fg='yellow', nl=False) + click.secho(f'{entrypoint_name} from command: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) if onprem_utils.check_local_cloud_args(cloud, cluster, yaml_config): @@ -1097,6 +1107,9 @@ def _make_task_or_dag_from_entrypoint_with_overrides( usage_lib.messages.usage.update_user_task_yaml(entrypoint) dag = dag_utils.load_chain_dag_from_yaml(entrypoint, env_overrides=env) if len(dag.tasks) > 1: + if task_only: + raise click.UsageError( + f'Expected a single task, but got {len(dag.tasks)} tasks.') # When the dag has more than 1 task. It is unclear how to # override the params for the dag. So we just ignore the # override params. @@ -1750,7 +1763,6 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: reserved_clusters.append(cluster_record) else: - is_skyserve_controller = False for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: if cluster_name.startswith(prefix): skyserve_controllers.append(cluster_record) @@ -4003,6 +4015,7 @@ def serve(): @click.argument('entrypoint', required=True, type=str, + nargs=-1, **_get_shell_complete_args(_complete_file_name)) @click.option('--service-name', '-n', @@ -4016,8 +4029,9 @@ def serve(): default=False, required=False, help='Skip confirmation prompt.') +# TODO(tian): Support the task_option overrides for the service. 
def serve_up( - entrypoint: str, + entrypoint: List[str], service_name: Optional[str], yes: bool, ): @@ -4050,61 +4064,10 @@ def serve_up( with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) - shell_splits = shlex.split(entrypoint) - yaml_file_provided = (len(shell_splits) == 1 and - (shell_splits[0].endswith('yaml') or - shell_splits[0].endswith('.yml'))) - if not yaml_file_provided: - raise click.UsageError('ENTRYPOINT must points to a valid YAML file.') - - is_yaml = True - config: Optional[List[Dict[str, Any]]] = None - try: - with open(entrypoint, 'r') as f: - try: - config = list(yaml.safe_load_all(f)) - if config: - # FIXME(zongheng): in a chain DAG YAML it only returns the - # first section. OK for downstream but is weird. - result = config[0] - else: - result = {} - if isinstance(result, str): - invalid_reason = ( - 'cannot be parsed into a valid YAML file. ' - 'Please check syntax.') - is_yaml = False - except yaml.YAMLError as e: - if yaml_file_provided: - logger.debug(e) - invalid_reason = ('contains an invalid configuration. ' - ' Please check syntax.') - is_yaml = False - except OSError: - entry_point_path = os.path.expanduser(entrypoint) - if not os.path.exists(entry_point_path): - invalid_reason = ('does not exist. Please check if the path' - ' is correct.') - elif not os.path.isfile(entry_point_path): - invalid_reason = ('is not a file. Please check if the path' - ' is correct.') - else: - invalid_reason = ('yaml.safe_load() failed. 
Please check if the' - ' path is correct.') - is_yaml = False - if not is_yaml: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'{entrypoint!r} looks like a yaml path but {invalid_reason}') + task = _make_task_or_dag_from_entrypoint_with_overrides( + entrypoint, yaml_only=True, task_only=True, entrypoint_name='Service') + assert isinstance(task, sky.Task) - click.secho('Service from YAML spec: ', fg='yellow', nl=False) - click.secho(entrypoint, bold=True) - usage_lib.messages.usage.update_user_task_yaml(entrypoint) - dag = dag_utils.load_chain_dag_from_yaml(entrypoint) - if len(dag.tasks) > 1: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Multiple tasks found in the YAML file.') - task: sky.Task = dag.tasks[0] if task.service is None: with ux_utils.print_exception_no_traceback(): raise ValueError('Service section not found in the YAML file.') @@ -4118,10 +4081,10 @@ def serve_up( app_port = int(task.service.app_port) task.set_resources(requested_resources.copy(ports=[app_port])) - click.secho('Service Spec:', fg='cyan') + click.secho('\nService Spec:', fg='cyan') click.echo(task.service) - click.secho('Each replica will use the following resource (estimated):', + click.secho('Each replica will use the following resources (estimated):', fg='cyan') with sky.Dag() as dag: dag.add(task) diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 128ac1e34ab..7a26d240494 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -193,12 +193,11 @@ def policy_str(self): def __repr__(self) -> str: return textwrap.dedent(f"""\ - Readiness probe method: {self.probe_str()} - Readiness initial delay seconds: {self.initial_delay_seconds} - Replica autoscaling policy: {self.policy_str()} - Replica auto restart: {self.auto_restart} - - Please refer to SkyPilot Serve document for detailed explanations. 
+ Readiness probe method: {self.probe_str()} + Readiness initial delay seconds: {self.initial_delay_seconds} + Replica autoscaling policy: {self.policy_str()} + Replica auto restart: {self.auto_restart} + Please refer to SkyServe document for detailed explanations. """) @property From bdef68289dd38c2d0d4c2c501fc565f008d7f419 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 15:29:01 -0700 Subject: [PATCH 052/223] make sky serve status accept multiple service names --- sky/__init__.py | 1 + sky/backends/backend_utils.py | 37 +++++++++++++++++++++++++---------- sky/cli.py | 17 ++++++++-------- sky/core.py | 34 ++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 20 deletions(-) diff --git a/sky/__init__.py b/sky/__init__.py index 15a0f69128d..8b8c22e3771 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -41,6 +41,7 @@ from sky.resources import Resources from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus +from sky.status_lib import ServiceStatus from sky.task import Task # Aliases. diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index bca4b7ac428..b1b1e470191 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2743,13 +2743,31 @@ def _refresh_service_record( # TODO(tian): Maybe aggregate services using same controller to reduce SSH # overhead? 
-def refresh_service_status(service_name: Optional[str]) -> List[Dict[str, Any]]: - if service_name is None: - service_names = [ - record['name'] for record in global_user_state.get_services() - ] - else: - service_names = [service_name] +def refresh_service_status( + service_names: Optional[Union[str, List[str]]]) -> List[Dict[str, Any]]: + yellow = colorama.Fore.YELLOW + bright = colorama.Style.BRIGHT + reset = colorama.Style.RESET_ALL + + records = global_user_state.get_services() + if service_names is not None: + if isinstance(service_names, str): + service_names = [service_names] + new_records = [] + not_exist_service_names = [] + for service_name in service_names: + for record in records: + if record['name'] == service_name: + new_records.append(record) + break + else: + not_exist_service_names.append(service_name) + if not_exist_service_names: + services_str = ', '.join(not_exist_service_names) + logger.info(f'Service(s) not found: {bright}{services_str}{reset}.') + records = new_records + + service_names = [record['name'] for record in records] plural = 's' if len(service_names) > 1 else '' progress = rich_progress.Progress(transient=True, @@ -2764,9 +2782,8 @@ def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: record, msg = _refresh_service_record(service_name) if msg is not None: progress.stop() - print( - f'{colorama.Fore.YELLOW}Error occurred when refreshing service ' - f'{service_name}: {msg}{colorama.Style.RESET_ALL}') + print(f'{yellow}Error occurred when refreshing service ' + f'{service_name}: {msg}{reset}') progress.start() progress.update(task, advance=1) return record diff --git a/sky/cli.py b/sky/cli.py index 3b3fa5d8dec..251a0074f39 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -147,7 +147,7 @@ def _get_glob_services(service_names: List[str]) -> List[str]: glob_service_name = global_user_state.get_glob_service_names( service_name) if not glob_service_name: - click.echo(f'Service {service_name!r} not found.') + 
click.echo(f'Service {service_name} not found.') glob_service_names.extend(glob_service_name) return list(set(glob_service_names)) @@ -4106,13 +4106,14 @@ def serve_up( is_flag=True, required=False, help='Show all information in full.') -@click.argument('service_name', +@click.argument('service_names', required=False, type=str, + nargs=-1, **_get_shell_complete_args(_complete_service_name)) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def serve_status(all: bool, service_name: Optional[str]): +def serve_status(all: bool, service_names: List[str]): """Show statuses of SkyServe service. Show detailed statuses of the service. If SERVICE_NAME is not provided, @@ -4187,12 +4188,11 @@ def serve_status(all: bool, service_name: Optional[str]): # Only show status of my-service sky serve status my-service """ - service_records = core.serve_status(service_name) - if service_name is not None and not service_records: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r} not found.') click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') + if service_names: + service_names = _get_glob_services(service_names) + service_records = core.serve_status(service_names) status_utils.show_service_table(service_records, all) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Replicas{colorama.Style.RESET_ALL}') @@ -4206,8 +4206,9 @@ def serve_status(all: bool, service_name: Optional[str]): @serve.command('down', cls=_DocumentedCodeCommand) @click.argument('service_names', - nargs=-1, required=False, + type=str, + nargs=-1, **_get_shell_complete_args(_complete_service_name)) @click.option('--all', '-a', diff --git a/sky/core.py b/sky/core.py index 4682cbe0e47..bd288405ff2 100644 --- a/sky/core.py +++ b/sky/core.py @@ -989,8 +989,38 @@ def storage_delete(name: str) -> None: @usage_lib.entrypoint -def serve_status(service_name: Optional[str]) -> List[Dict[str, Any]]: - return 
backend_utils.refresh_service_status(service_name) +def serve_status( + service_names: Optional[Union[str, + List[str]]] = None) -> List[Dict[str, Any]]: + """Get service statuses. + + If service_names is given, return those services. Otherwise, return all + services. + + Each returned value has the following fields: + + .. code-block:: python + + { + 'name': (str) service name, + 'launched_at': (int) timestamp of creation, + 'controller_name': (str) name of the controller cluster of the + service, + 'handle': (serve.ServiceHandle) handle of the service, + 'status': (sky.ServiceStatus) service status, + } + + For possible service statuses, please refer to sky.cli.serve_status. + + Args: + service_names: a list of service names to query. If None, query all + services. + + Returns: + A list of dicts, with each dict containing the information of a service. + If a service is not found, it will be omitted from the returned list. + """ + return backend_utils.refresh_service_status(service_names) @usage_lib.entrypoint From 96419ae471c91dbe9c82a27fac728411eccbc56a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 15:30:58 -0700 Subject: [PATCH 053/223] minor --- sky/core.py | 2 +- sky/execution.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/core.py b/sky/core.py index bd288405ff2..f6db12c2114 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1083,7 +1083,7 @@ def serve_tail_logs(service_name: str, def serve_down( service_name: str, purge: bool, -): +) -> None: """Teardown a service. Please refer to the sky.cli.serve_down for the document. diff --git a/sky/execution.py b/sky/execution.py index abba9b31852..6e58416ca2a 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -975,7 +975,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): def serve_up( task: 'sky.Task', service_name: str, -): +) -> None: """Spin up a service. Please refer to the sky.cli.serve_up for the document. 
From d0a3fb820e11f7895013533e00a630c3bd29c7e7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 15:41:48 -0700 Subject: [PATCH 054/223] minor --- sky/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/__init__.py b/sky/__init__.py index 8b8c22e3771..700c7171087 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -87,7 +87,6 @@ 'exec', 'spot_launch', 'serve_up', - 'serve_down', # core APIs 'status', 'start', @@ -108,4 +107,6 @@ # core APIs Storage Management 'storage_ls', 'storage_delete', + # core APIs Serve Management + 'serve_down', ] From 1eefc54874eedfeaf8d5a2c43d944bf0edaeeb3b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 15:58:22 -0700 Subject: [PATCH 055/223] upd docstring --- sky/__init__.py | 1 + sky/core.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sky/__init__.py b/sky/__init__.py index 700c7171087..0b0a71d7907 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -41,6 +41,7 @@ from sky.resources import Resources from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus +from sky.status_lib import ReplicaStatus from sky.status_lib import ServiceStatus from sky.task import Task diff --git a/sky/core.py b/sky/core.py index f6db12c2114..55a0cbe2678 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1008,9 +1008,22 @@ def serve_status( service, 'handle': (serve.ServiceHandle) handle of the service, 'status': (sky.ServiceStatus) service status, + 'replica_info': (List[Dict[str, Any]]) replica information, } - For possible service statuses, please refer to sky.cli.serve_status. + Each entry in replica_info has the following fields: + + .. code-block:: python + + { + 'replica_id': (int) replica id, + 'name': (str) replica name, + 'status': (sky.ReplicaStatus) replica status, + 'handle': (ResourceHandle) handle of the replica cluster, + } + + For possible service statuses and replica statuses, please refer to + sky.cli.serve_status. 
Args: service_names: a list of service names to query. If None, query all From 457c7a2fe98cf056463a42f4262e7faa2620d025 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 10 Sep 2023 16:32:54 -0700 Subject: [PATCH 056/223] fix --- sky/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 251a0074f39..a57188a33db 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4190,9 +4190,10 @@ def serve_status(all: bool, service_names: List[str]): """ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') + query_services: Optional[List[str]] = None if service_names: - service_names = _get_glob_services(service_names) - service_records = core.serve_status(service_names) + query_services = _get_glob_services(service_names) + service_records = core.serve_status(query_services) status_utils.show_service_table(service_records, all) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Replicas{colorama.Style.RESET_ALL}') From 9a2d27e3825f74b48f590fed523dbc9583b0478b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 11 Sep 2023 11:13:27 -0700 Subject: [PATCH 057/223] better programmatic api --- sky/cli.py | 6 +++--- sky/core.py | 49 +++++++++++++++++++++++++++--------------------- sky/execution.py | 8 ++++++-- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index a57188a33db..75aaabccd4b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4371,9 +4371,9 @@ def serve_logs( 'One and only one of --controller, --load-balancer, ' '[REPLICA_ID] can be specified.') core.serve_tail_logs(service_name, - controller, - load_balancer, - replica_id, + controller=controller, + load_balancer=load_balancer, + replica_id=replica_id, follow=follow) diff --git a/sky/core.py b/sky/core.py index 55a0cbe2678..dd56870e973 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1037,11 +1037,14 @@ def serve_status( @usage_lib.entrypoint -def serve_tail_logs(service_name: str, - 
controller: bool = False, - load_balancer: bool = False, - replica_id: Optional[int] = None, - follow: bool = True) -> None: +def serve_tail_logs( + service_name: str, + *, + controller: bool = False, + load_balancer: bool = False, + replica_id: Optional[int] = None, + follow: bool = True, +) -> None: """Tail logs for a service. Usage: @@ -1080,7 +1083,9 @@ def serve_tail_logs(service_name: str, controller_name = service_record['controller_name'] handle = global_user_state.get_handle_from_cluster_name(controller_name) if handle is None: - raise ValueError(f'Cannot find controller for service {service_name}.') + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Cannot find controller for service {service_name}.') assert isinstance(handle, backends.CloudVmRayResourceHandle), handle backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend @@ -1093,10 +1098,7 @@ def serve_tail_logs(service_name: str, @usage_lib.entrypoint -def serve_down( - service_name: str, - purge: bool, -) -> None: +def serve_down(service_name: str, purge: bool = False) -> None: """Teardown a service. Please refer to the sky.cli.serve_down for the document. @@ -1124,8 +1126,8 @@ def serve_down( if service_handle.controller_port is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'Controller job of service {service_name!r} not found.' 
- ) + f'Controller job of service {service_name!r} ' + 'not found.') code = serve.ServeCodeGen.terminate_service( service_handle.controller_port) @@ -1145,15 +1147,18 @@ def serve_down( resp = serve.load_terminate_service_result( terminate_service_payload) if resp.status_code != 200: - raise RuntimeError('Failed to terminate replica of service ' - f'{service_name!r} due to request ' - f'failure: {resp.text}') + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Failed to terminate replica of service ' + f'{service_name!r} due to request ' + f'failure: {resp.text}') msg = resp.json()['message'] if msg: - raise RuntimeError( - 'Unexpected message when tearing down replica of service ' - f'{service_name!r}: {msg}. Please login to the controller ' - 'and make sure the service is properly cleaned.') + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Unexpected message when tearing down replica of ' + f'service {service_name!r}: {msg}. Please login to ' + 'the controller and make sure the service is properly ' + 'cleaned up.') # We want to make sure no matter what error happens, we can still # clean up the record if purge is True. @@ -1162,7 +1167,8 @@ def serve_down( logger.warning('Ignoring error when cleaning replicas of ' f'{service_name!r}: {e}') else: - raise RuntimeError(e) from e + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(e) from e else: if not purge: with ux_utils.print_exception_no_traceback(): @@ -1205,7 +1211,8 @@ def serve_down( 'Ignoring error when stopping controller and ' f'load balancer jobs of service {service_name!r}: {e}') else: - raise RuntimeError(e) from e + with ux_utils.print_exception_no_traceback(): + raise RuntimeError(e) from e # TODO(tian): Maybe add a post_cleanup function? 
controller_yaml_path = serve.generate_controller_yaml_file_name( diff --git a/sky/execution.py b/sky/execution.py index 6e58416ca2a..16146b3beb8 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -974,7 +974,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): @usage_lib.entrypoint def serve_up( task: 'sky.Task', - service_name: str, + service_name: Optional[str] = None, ) -> None: """Spin up a service. @@ -984,6 +984,9 @@ def serve_up( task: sky.Task to serve up. service_name: Name of the service. """ + if service_name is None: + service_name = backend_utils.generate_service_name() + if re.fullmatch(serve.SERVICE_NAME_VALID_REGEX, service_name) is None: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service name {service_name!r} is invalid: ' @@ -997,7 +1000,8 @@ def serve_up( 'taken. Please use a different name.') if task.service is None: - raise RuntimeError('Service section not found.') + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Service section not found.') controller_resources_config: Dict[str, Any] = copy.copy( serve.CONTROLLER_RESOURCES) if task.service.controller_resources is not None: From f4bdbdba5e4e04f2cb50a401641c9a3371d83409 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 12 Sep 2023 16:25:48 -0700 Subject: [PATCH 058/223] ux --- sky/backends/backend_utils.py | 14 ++++++++------ sky/cli.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b1b1e470191..37511eeb155 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2682,27 +2682,29 @@ def _refresh_service_record_no_lock( if cluster_record is None: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, (f'Controller cluster {controller_name!r} ' - 'is not found.') + return record, None handle = cluster_record['handle'] backend = 
get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) if service_handle.controller_port is None: - return record, 'Controller task is not successfully launched.' + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) + return record, None code = serve_lib.ServeCodeGen.get_latest_info( service_handle.controller_port) - returncode, latest_info_payload, stderr = backend.run_on_head( + returncode, latest_info_payload, _ = backend.run_on_head( handle, code, require_outputs=True, stream_logs=False, separate_stderr=True) if returncode != 0: - return record, ('Failed to refresh replica info from the controller. ' - f'Using the cached record. Reason: {stderr}') + global_user_state.set_service_status( + service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) + return record, None latest_info = serve_lib.load_latest_info(latest_info_payload) service_handle.uptime = latest_info['uptime'] diff --git a/sky/cli.py b/sky/cli.py index 75aaabccd4b..770e2f4b848 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4204,6 +4204,17 @@ def serve_status(all: bool, service_names: List[str]): replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) + failed_controllers = [ + record['name'] + for record in service_records + if record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED + ] + if failed_controllers: + num_failed = len(failed_controllers) + plural = '' if num_failed == 1 else 's' + click.echo(f'\n* {num_failed} service{plural} with failed controller ' + 'found. 
The replica info and number might not be accurate.') + @serve.command('down', cls=_DocumentedCodeCommand) @click.argument('service_names', From 78d73423242857b834f9d1ebd1ad741fa24047c0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 13 Sep 2023 13:29:50 -0700 Subject: [PATCH 059/223] use flag to control logging --- sky/backends/cloud_vm_ray_backend.py | 19 +++++++++---------- sky/core.py | 3 ++- sky/execution.py | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index dd13b1c5274..8dd0b0b1264 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2593,6 +2593,7 @@ def __init__(self): self._dag = None self._optimize_target = None self._requested_features = set() + self._minimize_logging = False # Command for running the setup script. It is only set when the # setup needs to be run outside the self._setup() and as part of @@ -2608,6 +2609,8 @@ def register_info(self, **kwargs) -> None: self._optimize_target) or optimizer.OptimizeTarget.COST self._requested_features = kwargs.pop('requested_features', self._requested_features) + self._minimize_logging = kwargs.pop('minimize_logging', + self._minimize_logging) assert len(kwargs) == 0, f'Unexpected kwargs: {kwargs}' def check_resources_fit_cluster(self, handle: CloudVmRayResourceHandle, @@ -3226,7 +3229,7 @@ def _exec_code_on_head( f'Failed to submit job {job_id}.', stderr=stdout + stderr) - if not handle.cluster_name.startswith(serve_lib.CONTROLLER_PREFIX): + if not self._minimize_logging: logger.info('Job submitted with Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}') @@ -3259,9 +3262,7 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif not name.startswith(serve_lib.CONTROLLER_PREFIX): - # Skip logging for submit controller & load balancer jobs - # to skyserve 
controller cluster + elif not self._minimize_logging: logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -3384,8 +3385,7 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - if (name == spot_lib.SPOT_CONTROLLER_NAME or down or - name.startswith(serve_lib.CONTROLLER_PREFIX)): + if down or self._minimize_logging: return stop_str = ('\nTo stop the cluster:' f'\t{backend_utils.BOLD}sky stop {name}' @@ -3503,8 +3503,7 @@ def get_job_status( def cancel_jobs(self, handle: CloudVmRayResourceHandle, jobs: Optional[List[int]], - cancel_all: bool = False, - silent: bool = False) -> None: + cancel_all: bool = False) -> None: """Cancels jobs. CloudVMRayBackend specific method. @@ -3537,7 +3536,7 @@ def cancel_jobs(self, cancelled_ids = common_utils.decode_payload(stdout) - if silent: + if self._minimize_logging: return if cancelled_ids: @@ -4284,7 +4283,7 @@ def _check_existing_cluster( f'{cluster_name!r} [Username: {ssh_user}].' f'{colorama.Style.RESET_ALL}\n' 'Run `sky status` to see existing clusters.') - elif not cluster_name.startswith(serve_lib.CONTROLLER_PREFIX): + elif not self._minimize_logging: logger.info( f'{colorama.Fore.CYAN}Creating a new cluster: "{cluster_name}" ' f'[{task.num_nodes}x {to_provision}].' diff --git a/sky/core.py b/sky/core.py index dd56870e973..1513b0bd596 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1179,13 +1179,14 @@ def serve_down(service_name: str, purge: bool = False) -> None: if handle is not None: assert isinstance(handle, backends.CloudVmRayResourceHandle) backend = backends.CloudVmRayBackend() + backend.register_info(minimize_logging=True) # Cancel the controller and load balancer jobs. # For the case when controller / load_balancer job failed to submit. 
jobs = [] if service_handle.job_id is not None: jobs.append(service_handle.job_id) - backend.cancel_jobs(handle, jobs=jobs, silent=True) + backend.cancel_jobs(handle, jobs=jobs) # Cleanup all files on controller related to this service. # We have a 10-min grace period for the controller to autostop, diff --git a/sky/execution.py b/sky/execution.py index 16146b3beb8..382c3c75ab1 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -180,6 +180,7 @@ def _execute( idle_minutes_to_autostop: Optional[int] = None, no_setup: bool = False, clone_disk_from: Optional[str] = None, + minimize_logging: bool = False, # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, @@ -323,7 +324,8 @@ def _execute( backend.register_info(dag=dag, optimize_target=optimize_target, - requested_features=requested_features) + requested_features=requested_features, + minimize_logging=minimize_logging) if task.storage_mounts is not None: # Optimizer should eventually choose where to store bucket @@ -380,26 +382,21 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - if (cluster_name != spot.SPOT_CONTROLLER_NAME and - cluster_name is not None and - not cluster_name.startswith(serve.CONTROLLER_PREFIX)): + if not minimize_logging: # UX: print live clusters to make users aware (to save costs). # # Don't print if this job is launched by the spot controller, # because spot jobs are serverless, there can be many of them, and # users tend to continuously monitor spot jobs using `sky spot - # status`. + # status`. Also don't print if this job is a skyserve controller + # job. # # Disable the usage collection for this status command. env = dict(os.environ, **{env_options.Options.DISABLE_LOGGING.value: '1'}) subprocess_utils.run('sky status --no-show-spot-jobs', env=env) - # UX: Don't show cursor if we are initializing a skyserve controller, - # since it will mess up the progress bar. 
- if (cluster_name is None or - not cluster_name.startswith(serve.CONTROLLER_PREFIX)): - print() - print('\x1b[?25h', end='') # Show cursor. + print() + print('\x1b[?25h', end='') # Show cursor. return job_id @@ -798,6 +795,7 @@ def spot_launch( idle_minutes_to_autostop=spot. SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, + minimize_logging=True, ) @@ -1126,6 +1124,7 @@ def serve_up( # value and a previous controller could be reused. idle_minutes_to_autostop=serve.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, + minimize_logging=True, ) controller_record = global_user_state.get_cluster_from_name( @@ -1140,6 +1139,7 @@ def serve_up( assert isinstance(handle, backends.CloudVmRayResourceHandle) backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend + backend.register_info(minimize_logging=True) service_handle.endpoint_ip = handle.head_ip global_user_state.set_service_handle(service_name, service_handle) From baf62f9e006345e58be1cae82a0b93573fa4c667 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 13 Sep 2023 17:09:28 -0700 Subject: [PATCH 060/223] combine reserved prefix & name --- sky/backends/backend_utils.py | 77 ++++++++++++++------ sky/cli.py | 131 +++++++++++++--------------------- sky/core.py | 4 +- 3 files changed, 105 insertions(+), 107 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 37511eeb155..b5e8f5e358a 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,6 @@ """Util constants/functions for the backends.""" import copy +import dataclasses from datetime import datetime import difflib import enum @@ -14,7 +15,8 @@ import textwrap import time import typing -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union +from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, + Union) import uuid import colorama @@ -101,18 +103,52 @@ # Note: This 
value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 -# Mapping from reserved cluster names to the corresponding group name (logging -# purpose). -# NOTE: each group can only have one reserved cluster name for now. -SKY_RESERVED_CLUSTER_NAMES: Dict[str, str] = { - spot_lib.SPOT_CONTROLLER_NAME: 'Managed spot controller' -} -# Mapping from reserved cluster prefixes to the corresponding group name -# (logging purpose). -SKY_RESERVED_CLUSTER_PREFIXES: Dict[str, str] = { - serve_lib.CONTROLLER_PREFIX: 'SkyServe controller', -} +@dataclasses.dataclass +class ReservedClusterRecord: + """Record for reserved cluster group.""" + group_name: str + check: Callable[[str], bool] + sky_status_hint: str + decline_stop_hint: str + decline_cancel_hint: str + check_cluster_name_hint: str + + +class ReservedClusterGroup(enum.Enum): + """Reserved cluster groups for skypilot.""" + # NOTE(dev): Keep this align with + # sky/cli.py::_RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE + SPOT_CONTROLLER = ReservedClusterRecord( + group_name='Managed spot controller', + check=lambda name: name == spot_lib.SPOT_CONTROLLER_NAME, + sky_status_hint=( + f'* To see detailed spot job status: {colorama.Style.BRIGHT}' + f'sky spot queue{colorama.Style.RESET_ALL}'), + decline_stop_hint=('Spot controller will be auto-stopped after all ' + 'spot jobs finish.'), + check_cluster_name_hint=( + f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' + 'managed spot controller. 
')) + SKY_SERVE_CONTROLLER = ReservedClusterRecord( + group_name='Sky Serve controller', + check=lambda name: name.startswith(serve_lib.CONTROLLER_PREFIX), + sky_status_hint=( + f'* To see detailed service status: {colorama.Style.BRIGHT}' + f'sky serve status{colorama.Style.RESET_ALL}'), + decline_stop_hint=(f'To teardown a service, use {colorama.Style.BRIGHT}' + f'sky serve down{colorama.Style.RESET_ALL}.'), + check_cluster_name_hint=( + f'Cluster prefix {serve_lib.CONTROLLER_PREFIX} is reserved for ' + 'sky serve controller. ')) + + @classmethod + def get_group(cls, name: str) -> Optional['ReservedClusterGroup']: + for group in cls: + if group.value.check(name): + return group + return None + # Filelocks for the cluster status change. CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') @@ -2534,7 +2570,7 @@ def get_clusters( if not include_reserved: records = [ record for record in records - if record['name'] not in SKY_RESERVED_CLUSTER_NAMES + if ReservedClusterGroup.get_group(record['name']) is None ] yellow = colorama.Fore.YELLOW @@ -2875,16 +2911,11 @@ def check_cluster_name_not_reserved( Returns: None, if the cluster name is not reserved. """ - msg = None - if cluster_name in SKY_RESERVED_CLUSTER_NAMES: - msg = (f'Cluster {cluster_name!r} is reserved for the ' - f'{SKY_RESERVED_CLUSTER_NAMES[cluster_name].lower()}.') - for prefix in SKY_RESERVED_CLUSTER_PREFIXES: - if cluster_name is not None and cluster_name.startswith(prefix): - msg = (f'Cluster prefix {prefix!r} is reserved for the ' - f'{SKY_RESERVED_CLUSTER_PREFIXES[prefix].lower()}.') - break - if msg is not None: + if cluster_name is None: + return + group = ReservedClusterGroup.get_group(cluster_name) + if group is not None: + msg = group.value.check_cluster_name_hint if operation_str is not None: msg += f' {operation_str} is not allowed.' 
with ux_utils.print_exception_no_traceback(): diff --git a/sky/cli.py b/sky/cli.py index 770e2f4b848..4ce6159fb79 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1754,38 +1754,30 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): query_clusters = _get_glob_clusters(clusters) cluster_records = core.status(cluster_names=query_clusters, refresh=refresh) + hints = [] nonreserved_cluster_records = [] reserved_clusters = [] - # TODO(tian): Rename this variable if other reserved prefix are added. - skyserve_controllers = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + if group is not None: reserved_clusters.append(cluster_record) + hints.append(group.value.sky_status_hint) else: - for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: - if cluster_name.startswith(prefix): - skyserve_controllers.append(cluster_record) - break - else: - nonreserved_cluster_records.append(cluster_record) + nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - nonreserved_cluster_records + reserved_clusters, all) + nonreserved_cluster_records, all) status_utils.show_local_status_table(local_clusters) - hints = [] - if skyserve_controllers: + if reserved_clusters: click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}\n' - f'SkyServe Controllers{colorama.Style.RESET_ALL}') + f'Controllers{colorama.Style.RESET_ALL}') num_pending_autostop += status_utils.show_status_table( - skyserve_controllers, all) - hints.append( - f'* To see detailed service status: {colorama.Style.BRIGHT}' - f'sky serve status{colorama.Style.RESET_ALL}') + reserved_clusters, all) if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' @@ 
-1879,9 +1871,9 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: - cluster_group_name = backend_utils.SKY_RESERVED_CLUSTER_NAMES[ - cluster_name] + group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + if group is not None: + cluster_group_name = group.value.group_name # to display most recent entry for each reserved cluster # TODO(sgurram): fix assumption of sorted order of clusters if cluster_group_name not in reserved_clusters: @@ -2442,7 +2434,8 @@ def start( clusters = [ cluster['name'] for cluster in global_user_state.get_clusters() - if cluster['name'] not in backend_utils.SKY_RESERVED_CLUSTER_NAMES + if backend_utils.ReservedClusterGroup.get_group(cluster['name']) is + None ] if not clusters: @@ -2510,26 +2503,24 @@ def start( # Checks for reserved clusters (spot controller). reserved, non_reserved = [], [] for name in to_start: - if name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + if backend_utils.ReservedClusterGroup.get_group(name) is not None: reserved.append(name) else: non_reserved.append(name) if reserved and non_reserved: - assert len(reserved) == 1, reserved # Keep this behavior the same as _down_or_stop_clusters(). raise click.UsageError( - 'Starting the spot controller with other cluster(s) ' + 'Starting skypilot controllers with other cluster(s) ' 'is currently not supported.\n' 'Please start the former independently.') if reserved: - assert len(reserved) == 1, reserved bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD if idle_minutes_to_autostop is not None: raise click.UsageError( 'Autostop options are currently not allowed when starting the ' - 'spot controller. Use the default autostop settings by directly' - f' calling: {bold}sky start {reserved[0]}{reset_bold}') + 'controllers. 
Use the default autostop settings by directly ' + f'calling: {bold}sky start {" ".join(reserved)}{reset_bold}') if not yes: cluster_str = 'clusters' if len(to_start) > 1 else 'cluster' @@ -2698,10 +2689,18 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): f'the service{plural} first with {colorama.Style.BRIGHT}sky ' f'serve down {" ".join(service_names)}' f'{colorama.Style.RESET_ALL}.') - msg = (f'Tearing down sky serve controller: {controller_name}.') + msg = f'Tearing down sky serve controller: {controller_name}.' click.echo(msg) +_RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE = { + backend_utils.ReservedClusterGroup.SPOT_CONTROLLER: + (_hint_or_raise_for_down_spot_controller), + backend_utils.ReservedClusterGroup.SKY_SERVE_CONTROLLER: + (_hint_or_raise_for_down_sky_serve_controller), +} + + def _down_or_stop_clusters( names: List[str], apply_to_all: Optional[bool], @@ -2711,9 +2710,9 @@ def _down_or_stop_clusters( idle_minutes_to_autostop: Optional[int] = None) -> None: """Tears down or (auto-)stops a cluster (or all clusters). - Reserved clusters (spot controller) can only be terminated if the cluster - name is explicitly and uniquely specified (not via glob) and purge is set - to True. + Reserved clusters (spot controller and sky serve controller) can only be + terminated if the cluster name is explicitly and uniquely specified (not + via glob) and purge is set to True. 
""" if down: command = 'down' @@ -2745,12 +2744,12 @@ def _down_or_stop_clusters( if len(names) > 0: reserved_clusters = [ name for name in names - if name in backend_utils.SKY_RESERVED_CLUSTER_NAMES + if backend_utils.ReservedClusterGroup.get_group(name) is not None ] reserved_clusters_str = ', '.join(map(repr, reserved_clusters)) names = [ name for name in _get_glob_clusters(names) - if name not in backend_utils.SKY_RESERVED_CLUSTER_NAMES + if backend_utils.ReservedClusterGroup.get_group(name) is None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2763,50 +2762,19 @@ def _down_or_stop_clusters( f'Skipping local cluster {c}, as it does not support ' '`sky stop/autostop`.')) ] - name_to_reserved_prefix = dict() - for name in names: - for prefix in backend_utils.SKY_RESERVED_CLUSTER_PREFIXES: - if name.startswith(prefix): - name_to_reserved_prefix[name] = prefix - break - names = [name for name in names if name not in name_to_reserved_prefix] - reserve_prefix_str = ', '.join( - [f'{prefix}*' for prefix in name_to_reserved_prefix.values()]) - if len(name_to_reserved_prefix) > 0: - if len(names) != 0: - names_str = ', '.join(map(repr, names)) - raise click.UsageError( - f'{operation} cluster(s) with reserved prefix ' - f'{reserve_prefix_str} with other cluster(s) ' - f'{names_str} is currently not supported.\n' - 'Please omit the cluster(s) with reserved prefix ' - f'{name_to_reserved_prefix.values()}.') - if not down: - raise click.UsageError( - f'{operation} cluster(s) with reserved prefix ' - f'{reserve_prefix_str} is not supported. To teardown a ' - 'service, please use `sky serve down`.') - else: - if len(name_to_reserved_prefix) > 1: - raise click.UsageError( - f'{operation} multiple clusters with reserved prefix ' - f'{reserve_prefix_str} is currently not supported.\n' - 'Please omit all but one of the clusters.') - # We can only teardown one reserved cluster (sky serve - # controller) for now. 
- _hint_or_raise_for_down_sky_serve_controller( - list(name_to_reserved_prefix.keys())[0]) - confirm_str = 'delete' - user_input = click.prompt( - f'To proceed, please type {colorama.Style.BRIGHT}' - f'{confirm_str!r}{colorama.Style.RESET_ALL}', - type=str) - if user_input != confirm_str: - raise click.Abort() - no_confirm = True # Make sure the reserved clusters are explicitly specified without other # normal clusters. if len(reserved_clusters) > 0: + name2group: Dict[str, backend_utils.ReservedClusterGroup] = dict() + for name in reserved_clusters: + group = backend_utils.ReservedClusterGroup.get_group(name) + assert group is not None + name2group[name] = group + # Use set to remove duplicated sky serve controller messages. + decline_stop_hints = set() + for group in name2group.values(): + decline_stop_hints.add(group.value.decline_stop_hint) + decline_stop_hints = ' '.join(decline_stop_hints) if len(names) != 0: names_str = ', '.join(map(repr, names)) raise click.UsageError( @@ -2818,12 +2786,12 @@ def _down_or_stop_clusters( raise click.UsageError( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is currently not supported. ' - 'It will be auto-stopped after all spot jobs finish.') + f'{decline_stop_hints}') else: - # TODO(zhwu): We can only have one reserved cluster (spot - # controller). - assert len(reserved_clusters) == 1, reserved_clusters - _hint_or_raise_for_down_spot_controller(reserved_clusters[0]) + for reserved_cluster in reserved_clusters: + hint_or_raise = _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE[ + name2group[reserved_cluster]] + hint_or_raise(reserved_cluster) confirm_str = 'delete' user_input = click.prompt( f'To proceed, please check the warning above and type ' @@ -2845,9 +2813,8 @@ def _down_or_stop_clusters( # Otherwise, it would be very easy to accidentally delete a reserved # cluster. 
names = [ - record['name'] - for record in all_clusters - if record['name'] not in backend_utils.SKY_RESERVED_CLUSTER_NAMES + record['name'] for record in all_clusters if + backend_utils.ReservedClusterGroup.get_group(record['name']) is None ] clusters = [] diff --git a/sky/core.py b/sky/core.py index 1513b0bd596..a0cbc01207c 100644 --- a/sky/core.py +++ b/sky/core.py @@ -299,7 +299,7 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. """ - if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: raise exceptions.NotSupportedError( f'Stopping sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -422,7 +422,7 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: + if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' f'is not supported.') From 4c52014e0550153cde71d1738a8b384db374e856 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 13 Sep 2023 17:28:23 -0700 Subject: [PATCH 061/223] fix --- sky/backends/backend_utils.py | 6 ++++++ sky/cli.py | 18 +++--------------- sky/core.py | 9 ++++++--- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b5e8f5e358a..74685e344e2 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -127,6 +127,10 @@ class ReservedClusterGroup(enum.Enum): f'sky spot queue{colorama.Style.RESET_ALL}'), decline_stop_hint=('Spot controller will be auto-stopped after all ' 'spot jobs finish.'), + decline_cancel_hint=( + 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' + f'spot 
jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), check_cluster_name_hint=( f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' 'managed spot controller. ')) @@ -138,6 +142,8 @@ class ReservedClusterGroup(enum.Enum): f'sky serve status{colorama.Style.RESET_ALL}'), decline_stop_hint=(f'To teardown a service, use {colorama.Style.BRIGHT}' f'sky serve down{colorama.Style.RESET_ALL}.'), + decline_cancel_hint=( + 'Cancelling the sky serve controller\'s jobs is not allowed.'), check_cluster_name_hint=( f'Cluster prefix {serve_lib.CONTROLLER_PREFIX} is reserved for ' 'sky serve controller. ')) diff --git a/sky/cli.py b/sky/cli.py index 4ce6159fb79..d8b9b23d74d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,7 +51,6 @@ from sky import core from sky import exceptions from sky import global_user_state -from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib from sky import status_lib @@ -2102,8 +2101,6 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa Job IDs can be looked up by ``sky queue cluster_name``. """ - bold = colorama.Style.BRIGHT - reset = colorama.Style.RESET_ALL job_identity_str = None job_ids_to_cancel = None if not jobs and not all: @@ -2131,18 +2128,9 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - if cluster == spot_lib.SPOT_CONTROLLER_NAME: - # Friendly message for usage like 'sky cancel - # -a/'. - error_str = ( - 'Cancelling the spot controller\'s jobs is not allowed.' 
- f'\nTo cancel spot jobs, use: {bold}sky spot cancel [--all]{reset}') - else: - assert cluster.startswith(serve_lib.CONTROLLER_PREFIX) - error_str = ( - 'Cancelling the sky serve controller\'s jobs is not allowed.') - click.echo(error_str) + group = backend_utils.ReservedClusterGroup.get_group(cluster) + assert group is not None + click.echo(group.value.decline_cancel_hint) sys.exit(1) except ValueError as e: raise click.UsageError(str(e)) diff --git a/sky/core.py b/sky/core.py index a0cbc01207c..26a227a3add 100644 --- a/sky/core.py +++ b/sky/core.py @@ -184,17 +184,20 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if cluster_name == spot.SPOT_CONTROLLER_NAME: + if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' - 'supported for the spot controller. Pass ' + 'supported for skypilot controllers. Pass ' '`down=False` or omit it instead.') if idle_minutes_to_autostop is not None: raise ValueError( 'Passing a custom autostop setting is currently not ' - 'supported when starting the spot controller. To ' + 'supported when starting skypilot controllers. To ' 'fix: omit the `idle_minutes_to_autostop` argument to use the ' f'default autostop settings (got: {idle_minutes_to_autostop}).') + # TODO(tian): Maybe we should merge the two MINUTES_TO_AUTOSTOP + # together. Currently, the two value is the same so we just use spot + # constant here. 
idle_minutes_to_autostop = spot.SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP # NOTE: if spot_queue() calls _start() and hits here, that entrypoint From 755114f750b581118d35e5549bae29f532605ffa Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 13 Sep 2023 22:55:39 -0700 Subject: [PATCH 062/223] expand user --- sky/execution.py | 5 +++-- sky/serve/__init__.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 382c3c75ab1..c81d74e6373 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1035,8 +1035,9 @@ def serve_up( # TODO(tian): remove pylint disabling when filelock # version updated # pylint: disable=abstract-class-instantiated - with filelock.FileLock(serve.CONTROLLER_FILE_LOCK_PATH, - serve.CONTROLLER_FILE_LOCK_TIMEOUT): + with filelock.FileLock( + os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH), + serve.CONTROLLER_FILE_LOCK_TIMEOUT): controller_name, _ = serve.get_available_controller_name( controller_resources) global_user_state.add_or_update_service( diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index ffabbf4d947..0659de4ec98 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -28,4 +28,4 @@ from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec -os.makedirs(SERVE_PREFIX, exist_ok=True) +os.makedirs(os.path.expanduser(SERVE_PREFIX), exist_ok=True) From b547b19e153a6ba1bda9a1c68a373e0613b1e5ce Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 14 Sep 2023 16:12:52 -0700 Subject: [PATCH 063/223] better UX for auto restart --- sky/cli.py | 12 ++++++++++-- sky/execution.py | 3 ++- sky/serve/serve_utils.py | 4 ++++ sky/utils/cli_utils/status_utils.py | 8 ++++++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d8b9b23d74d..287a16b995e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -77,6 +77,7 @@ from sky.utils.cli_utils import status_utils if typing.TYPE_CHECKING: + from sky import serve as 
serve_lib from sky.backends import backend as backend_lib logger = sky_logging.init_logger(__name__) @@ -4154,9 +4155,16 @@ def serve_status(all: bool, service_names: List[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: + handle: 'serve_lib.ServiceHandle' = service_record['handle'] for replica_record in service_record['replica_info']: - replica_record['service_name'] = service_record['name'] - replica_infos.append(replica_record) + # Only print FAILED replicas if: + # 1. --all is specified; + # 2. auto_restart is not enabled (in which FAILED replica count + # as one replica). + if (all or not handle.auto_restart or replica_record['status'] != + status_lib.ReplicaStatus.FAILED): + replica_record['service_name'] = service_record['name'] + replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) failed_controllers = [ diff --git a/sky/execution.py b/sky/execution.py index c81d74e6373..3673e77964e 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1024,7 +1024,8 @@ def serve_up( service_name=service_name, policy=task.service.policy_str(), requested_resources=requested_resources, - requested_controller_resources=controller_resources) + requested_controller_resources=controller_resources, + auto_restart=task.service.auto_restart) # Use filelock here to make sure only one process can write to database # at the same time. Then we generate available controller name again to # make sure even in race condition, we can still get the correct controller diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 8f05b81c364..1058ffb7f34 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -299,6 +299,7 @@ class ServiceHandle(object): - (required) Service autoscaling policy description str. - (required) Service requested resources. - (required) Service requested controller resources. + - (required) Whether the service have auto restart enabled. 
- (optional) Service uptime. - (optional) Service endpoint IP. - (optional) Controller port. @@ -317,6 +318,7 @@ def __init__( policy: str, requested_resources: 'sky.Resources', requested_controller_resources: 'sky.Resources', + auto_restart: bool, uptime: Optional[int] = None, endpoint_ip: Optional[str] = None, controller_port: Optional[int] = None, @@ -331,6 +333,7 @@ def __init__( self.policy = policy self.requested_resources = requested_resources self.requested_controller_resources = requested_controller_resources + self.auto_restart = auto_restart self.controller_port = controller_port self.load_balancer_port = load_balancer_port self.job_id = job_id @@ -345,6 +348,7 @@ def __repr__(self): f'\n\trequested_resources={self.requested_resources},' '\n\trequested_controller_resources=' f'{self.requested_controller_resources},' + f'\n\tauto_restart={self.auto_restart},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' f'\n\tjob_id={self.job_id},' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 8df9f28b5e3..5b78609850a 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -421,11 +421,15 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: - ready_replica_num = 0 + ready_replica_num, total_replica_num = 0, 0 + auto_restart = _get_service_handle(service_record).auto_restart for info in service_record['replica_info']: if _get_status(info) == status_lib.ReplicaStatus.READY: ready_replica_num += 1 - total_replica_num = len(service_record['replica_info']) + # If auto restart enabled, not count FAILED replicas here. 
+ if (not auto_restart or + _get_status(info) != status_lib.ReplicaStatus.FAILED): + total_replica_num += 1 return f'{ready_replica_num}/{total_replica_num}' From f465e3217b7dd13040838c1254bb56a302b3beb7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 14 Sep 2023 19:31:22 -0700 Subject: [PATCH 064/223] fix consecutive timeout threshold --- sky/serve/infra_providers.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 64ff4093bd3..de54f912bb7 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -27,8 +27,6 @@ _ENDPOINT_PROBE_INTERVAL = 10 # TODO(tian): Maybe let user determine this threshold _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 -_CONSECUTIVE_FAILURE_THRESHOLD_COUNT = ( - _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT // _ENDPOINT_PROBE_INTERVAL) def _interrupt_process_and_children(pid: int) -> None: @@ -152,7 +150,7 @@ def __init__(self, replica_id: int, cluster_name: str) -> None: self.replica_id: int = replica_id self.cluster_name: str = cluster_name self.first_not_ready_time: Optional[float] = None - self.consecutive_failure_cnt: int = 0 + self.consecutive_failure_times: List[int] = [] self.status_property: ReplicaStatusProperty = ReplicaStatusProperty() @property @@ -624,30 +622,31 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: info = self.replica_info[cluster_name] info.status_property.service_ready_now = res if res: - info.consecutive_failure_cnt = 0 + info.consecutive_failure_times.clear() if not info.status_property.service_once_ready: info.status_property.service_once_ready = True continue if info.first_not_ready_time is None: info.first_not_ready_time = time.time() + current_time = time.time() + current_delay_seconds = current_time - info.first_not_ready_time if info.status_property.service_once_ready: - info.consecutive_failure_cnt += 1 - if (info.consecutive_failure_cnt >= - 
_CONSECUTIVE_FAILURE_THRESHOLD_COUNT): + info.consecutive_failure_times.append(current_time) + consecutive_failure_time = (info.consecutive_failure_times[-1] - + info.consecutive_failure_times[0]) + if (consecutive_failure_time >= + _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): logger.info(f'Replica {cluster_name} is not ready for too ' 'long and exceeding consecutive failure ' 'threshold. Terminating the replica...') self._teardown_cluster(cluster_name) else: - current_unready_time = (info.consecutive_failure_cnt * - _ENDPOINT_PROBE_INTERVAL) logger.info(f'Replica {cluster_name} is not ready but ' 'within consecutive failure threshold ' - f'({current_unready_time}s / ' + f'({consecutive_failure_time}s / ' f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' 'Skipping.') else: - current_delay_seconds = time.time() - info.first_not_ready_time if current_delay_seconds > self.initial_delay_seconds: logger.info(f'Replica {cluster_name} is not ready and ' 'exceeding initial delay seconds. ' From c773d703a7d5dfc49d27f7d36a6da7c122583018 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 14 Sep 2023 19:35:10 -0700 Subject: [PATCH 065/223] minor --- sky/serve/infra_providers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index de54f912bb7..45360c77d91 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -626,10 +626,9 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: if not info.status_property.service_once_ready: info.status_property.service_once_ready = True continue - if info.first_not_ready_time is None: - info.first_not_ready_time = time.time() current_time = time.time() - current_delay_seconds = current_time - info.first_not_ready_time + if info.first_not_ready_time is None: + info.first_not_ready_time = current_time if info.status_property.service_once_ready: info.consecutive_failure_times.append(current_time) consecutive_failure_time = 
(info.consecutive_failure_times[-1] - @@ -647,6 +646,7 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' 'Skipping.') else: + current_delay_seconds = current_time - info.first_not_ready_time if current_delay_seconds > self.initial_delay_seconds: logger.info(f'Replica {cluster_name} is not ready and ' 'exceeding initial delay seconds. ' From 4eb1d44c4cdefda8f1196291f750dc326fcf451c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 16 Sep 2023 11:05:50 -0700 Subject: [PATCH 066/223] nnit --- sky/serve/serve_utils.py | 2 +- sky/serve/service_spec.py | 11 ++++------- sky/task.py | 3 ++- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 1058ffb7f34..eb4503c94a9 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -339,7 +339,7 @@ def __init__( self.job_id = job_id self.ephemeral_storage = ephemeral_storage - def __repr__(self): + def __repr__(self) -> str: return ('ServiceHandle(' f'\n\tservice_name={self.service_name},' f'\n\tuptime={self.uptime},' diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 7a26d240494..a3e78a0e3d2 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -27,7 +27,7 @@ def __init__( post_data: Optional[Dict[str, Any]] = None, controller_resources: Optional[Dict[str, Any]] = None, auto_restart: bool = False, - ): + ) -> None: if min_replicas < 0: with ux_utils.print_exception_no_traceback(): raise ValueError( @@ -58,10 +58,7 @@ def __init__( self._auto_restart = auto_restart @staticmethod - def from_yaml_config(config: Optional[Dict[str, Any]]): - if config is None: - return None - + def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': backend_utils.validate_schema(config, schemas.get_service_schema(), 'Invalid service YAML: ') if 'replicas' in config and 'replica_policy' in config: @@ -127,7 +124,7 @@ def from_yaml_config(config: 
Optional[Dict[str, Any]]): return SkyServiceSpec(**service_config) @staticmethod - def from_yaml(yaml_path: str): + def from_yaml(yaml_path: str) -> 'SkyServiceSpec': with open(os.path.expanduser(yaml_path), 'r') as f: config = yaml.safe_load(f) @@ -146,7 +143,7 @@ def from_yaml(yaml_path: str): return SkyServiceSpec.from_yaml_config(config['service']) - def to_yaml_config(self): + def to_yaml_config(self) -> Dict[str, Any]: config = dict() def add_if_not_none(section, key, value, no_empty: bool = False): diff --git a/sky/task.py b/sky/task.py index 0be28047ffd..9518d9d3664 100644 --- a/sky/task.py +++ b/sky/task.py @@ -412,7 +412,8 @@ def from_yaml_config( task.set_resources({resources}) service = config.pop('service', None) - service = serve_lib.SkyServiceSpec.from_yaml_config(service) + if service is not None: + service = serve_lib.SkyServiceSpec.from_yaml_config(service) task.set_service(service) assert not config, f'Invalid task args: {config.keys()}' From d83f9a2ef1753241e1fee8836856987a1d103352 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 18 Sep 2023 14:31:16 -0700 Subject: [PATCH 067/223] temp remove --- sky/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 287a16b995e..09d6f7441ed 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1754,7 +1754,6 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]): query_clusters = _get_glob_clusters(clusters) cluster_records = core.status(cluster_names=query_clusters, refresh=refresh) - hints = [] nonreserved_cluster_records = [] reserved_clusters = [] for cluster_record in cluster_records: From dd261d57fcc64b11d11fe9939956b4bfc03436cb Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 18 Sep 2023 14:31:57 -0700 Subject: [PATCH 068/223] add back --- sky/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/cli.py b/sky/cli.py index ee037b782fa..df368beb520 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1834,6 +1834,7 @@ def status(all: bool, refresh: 
bool, ip: bool, show_spot_jobs: bool, head_ip = handle.external_ips()[0] click.echo(head_ip) return + hints = [] nonreserved_cluster_records = [] reserved_clusters = [] for cluster_record in cluster_records: From 6f6d8e7e2eede6e1c9b6fa00f4ac400882cb07f2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 18 Sep 2023 16:00:56 -0700 Subject: [PATCH 069/223] only open ports used --- sky/execution.py | 4 ++-- sky/serve/__init__.py | 1 - sky/serve/constants.py | 4 ---- sky/serve/serve_utils.py | 4 ---- sky/setup_files/setup.py | 4 +++- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 3673e77964e..059efdfd4db 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1007,8 +1007,6 @@ def serve_up( if 'ports' in controller_resources_config: with ux_utils.print_exception_no_traceback(): raise ValueError('Cannot specify ports for controller resources.') - # TODO(tian): Open required ports only after #2485 is merged. - controller_resources_config['ports'] = [serve.LOAD_BALANCER_PORT_RANGE] try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config) @@ -1053,6 +1051,8 @@ def serve_up( service_handle.controller_port = controller_port service_handle.load_balancer_port = load_balancer_port global_user_state.set_service_handle(service_name, service_handle) + controller_resources = controller_resources.copy( + ports=[load_balancer_port]) except filelock.Timeout as e: with ux_utils.print_exception_no_traceback(): raise RuntimeError( diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 0659de4ec98..e05b1711593 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -8,7 +8,6 @@ from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE -from sky.serve.constants import LOAD_BALANCER_PORT_RANGE from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import 
SERVE_STARTUP_TIMEOUT from sky.serve.constants import SERVICE_NAME_VALID_REGEX diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 6ae672c94e7..993161fc716 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -67,7 +67,3 @@ # automatically generated from this start port. CONTROLLER_PORT_START = 20001 LOAD_BALANCER_PORT_START = 30001 - -# Ports to open for controller VM. We open ~1000 ports for controller to ensure -# services can be started on the same controller without port conflicts. -LOAD_BALANCER_PORT_RANGE = '30001-31000' diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index eb4503c94a9..2f6b9751739 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -160,10 +160,6 @@ def gen_ports_for_serve_process(controller_name: str) -> Tuple[int, int]: existing_controller_ports.add(service_handle.controller_port) if service_handle.load_balancer_port is not None: existing_load_balancer_ports.add(service_handle.load_balancer_port) - # Cannot expose controller to public internet. - # We opened 30001-31000 for controller VM, so load balancer port - # should be in this range and controller port should not be in - # this range. controller_port = constants.CONTROLLER_PORT_START while controller_port in existing_controller_ports: controller_port += 1 diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 916dc7ee7a7..67063c1c07c 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -135,7 +135,9 @@ def parse_readme(readme: str) -> str: 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0', 'azure-mgmt-network' ], - 'gcp': ['google-api-python-client', 'google-cloud-storage'], + # We need google-api-python-client>=2.19.1 to enable 'reason' attribute + # of googleapiclient.errors.HttpError, which is widely used in our system. 
+ 'gcp': ['google-api-python-client >= 2.19.1', 'google-cloud-storage'], 'ibm': [ 'ibm-cloud-sdk-core', 'ibm-vpc', 'ibm-platform-services', 'ibm-cos-sdk' ], From 4037e7e7b40d205a4b2004e0642453a7433937bf Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 18 Sep 2023 22:28:57 -0700 Subject: [PATCH 070/223] remove redundant task yaml for load balancer --- sky/serve/load_balancer.py | 4 ---- sky/templates/sky-serve-controller.yaml.j2 | 5 ++--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 687fbffb574..7c5317ae941 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -114,10 +114,6 @@ def run(self): if __name__ == '__main__': # Add argparse parser = argparse.ArgumentParser(description='SkyServe Load Balancer') - parser.add_argument('--task-yaml', - type=str, - help='Task YAML file', - required=True) parser.add_argument('--load-balancer-port', type=int, help='Port to run the load balancer on.', diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index c8e6d983bad..f041ab5ac52 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -27,7 +27,6 @@ run: | # Start sky serve load balancer. We keep the load balancer running in the # foreground so that the job will not finish, thus prevent our controller # from auto down. 
- python -u -m sky.serve.load_balancer --task-yaml {{remote_task_yaml_path}} \ - --load-balancer-port {{load_balancer_port}} --app-port {{app_port}} \ - --controller-addr http://localhost:{{controller_port}} \ + python -u -m sky.serve.load_balancer --load-balancer-port {{load_balancer_port}} \ + --app-port {{app_port}} --controller-addr http://localhost:{{controller_port}} \ > {{load_balancer_log_file}} 2>&1 From 05784873a85490e721510a3ed15bfe38d6f95b11 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 20 Sep 2023 10:30:59 -0700 Subject: [PATCH 071/223] move task ports handling to python API --- sky/cli.py | 2 -- sky/execution.py | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index df368beb520..9b467f1b89b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4120,8 +4120,6 @@ def serve_up( raise ValueError( 'Specifying ports in resources is not allowed. SkyServe will ' 'use the port specified in the service section.') - app_port = int(task.service.app_port) - task.set_resources(requested_resources.copy(ports=[app_port])) click.secho('\nService Spec:', fg='cyan') click.echo(task.service) diff --git a/sky/execution.py b/sky/execution.py index 059efdfd4db..43f76ddafed 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1018,6 +1018,14 @@ def serve_up( assert task.service is not None, task assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] + if requested_resources.ports is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Specifying ports in resources is not allowed. 
SkyServe will ' + 'use the port specified in the service section.') + + task.set_resources(requested_resources.copy(ports=[task.service.app_port])) + service_handle = serve.ServiceHandle( service_name=service_name, policy=task.service.policy_str(), From a1004338830dc79d2e219f79e68d2ed604f96ef1 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 25 Sep 2023 21:41:30 -0700 Subject: [PATCH 072/223] fix controller generation bug --- sky/serve/serve_utils.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 2f6b9751739..b5e51b259d7 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -75,10 +75,27 @@ def values(self): def get_existing_controller_names() -> Set[str]: - return { + """Get existing sky serve controller names. + + There is two possible indicators for a controller: + 1. It is in the cluster database, which means it is already created; + 2. It is in the service database, which means it will be created + later in the future. This usually happens when multiple `sky serve up` + are running simultaneously. + + Returns: + A set of existing sky serve controller names. 
+ """ + controller_in_service_db = { record['controller_name'] for record in global_user_state.get_services() } + controller_in_cluster_db = { + record['name'] + for record in global_user_state.get_clusters() + if record['name'].startswith(constants.CONTROLLER_PREFIX) + } + return controller_in_service_db | controller_in_cluster_db def generate_controller_cluster_name(existing_controllers: Set[str]) -> str: From 8f49ff981a6bb0cb0b579466761ba474c59315c9 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 25 Sep 2023 21:41:36 -0700 Subject: [PATCH 073/223] UX nits --- sky/cli.py | 3 +++ sky/core.py | 45 +++++++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 9b467f1b89b..80b9fcd62af 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2748,6 +2748,9 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): services = global_user_state.get_services_from_controller_name( controller_name) if services: + # TODO(tian): When we switch to database for storing replica + # information, we could check total replicas of each service and + # allow terminating the controller if there is no existing replicas. service_names = [service['name'] for service in services] with ux_utils.print_exception_no_traceback(): plural = '' if len(service_names) == 1 else 's' diff --git a/sky/core.py b/sky/core.py index 26a227a3add..cacd174702f 100644 --- a/sky/core.py +++ b/sky/core.py @@ -21,6 +21,7 @@ from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib +from sky.utils import common_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import tpu_utils @@ -1122,6 +1123,10 @@ def serve_down(service_name: str, purge: bool = False) -> None: status_lib.ServiceStatus.SHUTTING_DOWN) handle = global_user_state.get_handle_from_cluster_name(controller_name) + controller_fetch_ip_error_message = ( + 'Failed to fetch controller IP. 
Please ' + 'check controller status and try again.') + if handle is not None: backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) @@ -1134,12 +1139,17 @@ def serve_down(service_name: str, purge: bool = False) -> None: code = serve.ServeCodeGen.terminate_service( service_handle.controller_port) - returncode, terminate_service_payload, stderr = backend.run_on_head( - handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) + + try: + (returncode, terminate_service_payload, + stderr) = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + except exceptions.FetchIPError as e: + raise RuntimeError(controller_fetch_ip_error_message) from e + subprocess_utils.handle_returncode( returncode, code, ('Failed when submit terminate request to controller ' @@ -1165,10 +1175,12 @@ def serve_down(service_name: str, purge: bool = False) -> None: # We want to make sure no matter what error happens, we can still # clean up the record if purge is True. - except Exception as e: # pylint: disable=broad-except + # pylint: disable=broad-except + except Exception as e: if purge: - logger.warning('Ignoring error when cleaning replicas of ' - f'{service_name!r}: {e}') + logger.warning('Ignoring error when cleaning ' + f'replicas of {service_name!r}: ' + f'{common_utils.format_exception(e)}') else: with ux_utils.print_exception_no_traceback(): raise RuntimeError(e) from e @@ -1189,7 +1201,11 @@ def serve_down(service_name: str, purge: bool = False) -> None: jobs = [] if service_handle.job_id is not None: jobs.append(service_handle.job_id) - backend.cancel_jobs(handle, jobs=jobs) + + try: + backend.cancel_jobs(handle, jobs=jobs) + except exceptions.FetchIPError as e: + raise RuntimeError(controller_fetch_ip_error_message) from e # Cleanup all files on controller related to this service. 
# We have a 10-min grace period for the controller to autostop, @@ -1209,11 +1225,12 @@ def serve_down(service_name: str, purge: bool = False) -> None: stream_logs=False) # same as above. - except Exception as e: # pylint: disable=broad-except + # pylint: disable=broad-except + except Exception as e: if purge: - logger.warning( - 'Ignoring error when stopping controller and ' - f'load balancer jobs of service {service_name!r}: {e}') + logger.warning('Ignoring error when stopping controller and ' + f'load balancer jobs of service {service_name!r}: ' + f'{common_utils.format_exception(e)}') else: with ux_utils.print_exception_no_traceback(): raise RuntimeError(e) from e From 4dd0e8e512bf0fad3b7c5a578e6a96f3126c7349 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 26 Sep 2023 13:02:57 -0700 Subject: [PATCH 074/223] nit --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 80b9fcd62af..17201e82c2f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2866,7 +2866,7 @@ def _down_or_stop_clusters( hint_or_raise(reserved_cluster) confirm_str = 'delete' user_input = click.prompt( - f'To proceed, please check the warning above and type ' + f'To proceed, please check the information above and type ' f'{colorama.Style.BRIGHT}{confirm_str!r}' f'{colorama.Style.RESET_ALL}', type=str) From 20dcd1b812439a296a9fac4a7670e1fe26d1c5fc Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 27 Sep 2023 14:33:53 -0700 Subject: [PATCH 075/223] Fix sky serve down --purge when storage cleanup failed --- sky/core.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index cacd174702f..f75b959acd1 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1242,5 +1242,13 @@ def serve_down(service_name: str, purge: bool = False) -> None: os.remove(controller_yaml_path) handle = global_user_state.get_handle_from_service_name(service_name) assert handle is not None - handle.cleanup_ephemeral_storage() + try: + 
handle.cleanup_ephemeral_storage() + # same as above. + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning('Ignoring error when cleaning up ephemeral storage ' + f'of service {service_name}: {e}') + else: + raise RuntimeError(e) from e global_user_state.remove_service(service_name) From 672840c475f4b01875c20c873fd216d9355a8d60 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 27 Sep 2023 14:45:29 -0700 Subject: [PATCH 076/223] ux --- sky/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 17201e82c2f..e0fd7bef5a9 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4262,8 +4262,9 @@ def serve_status(all: bool, service_names: List[str]): if failed_controllers: num_failed = len(failed_controllers) plural = '' if num_failed == 1 else 's' - click.echo(f'\n* {num_failed} service{plural} with failed controller ' - 'found. The replica info and number might not be accurate.') + click.echo( + f'\n* {num_failed} service{plural} with failed controller found. ' + 'Please manually check if there is any leaked resources.') @serve.command('down', cls=_DocumentedCodeCommand) From a8bec0dbad949c5c6aa17d3dce06e6bfacd33f1c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 28 Sep 2023 10:45:48 -0700 Subject: [PATCH 077/223] reuse service handle --- sky/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sky/core.py b/sky/core.py index f75b959acd1..eb3ac256172 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1240,10 +1240,8 @@ def serve_down(service_name: str, purge: bool = False) -> None: service_name) if os.path.exists(controller_yaml_path): os.remove(controller_yaml_path) - handle = global_user_state.get_handle_from_service_name(service_name) - assert handle is not None try: - handle.cleanup_ephemeral_storage() + service_handle.cleanup_ephemeral_storage() # same as above. 
except Exception as e: # pylint: disable=broad-except if purge: From 7bb01f6810e1817a3f88f76bfddbc91038d40474 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 28 Sep 2023 10:51:34 -0700 Subject: [PATCH 078/223] revert --- sky/setup_files/setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 67063c1c07c..916dc7ee7a7 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -135,9 +135,7 @@ def parse_readme(readme: str) -> str: 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0', 'azure-mgmt-network' ], - # We need google-api-python-client>=2.19.1 to enable 'reason' attribute - # of googleapiclient.errors.HttpError, which is widely used in our system. - 'gcp': ['google-api-python-client >= 2.19.1', 'google-cloud-storage'], + 'gcp': ['google-api-python-client', 'google-cloud-storage'], 'ibm': [ 'ibm-cloud-sdk-core', 'ibm-vpc', 'ibm-platform-services', 'ibm-cos-sdk' ], From fbd2873dae6671a347f9b1952890c08dd384b37a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 28 Sep 2023 11:52:33 -0700 Subject: [PATCH 079/223] add todo --- sky/serve/infra_providers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 45360c77d91..875b0c0f6ed 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -413,6 +413,8 @@ def _launch_cluster(self, replica_id: int) -> None: 'already exists. Skipping.') return logger.info(f'Creating SkyPilot cluster {cluster_name}') + # TODO(tian): We should do usage_lib.messages.usage.set_internal() + # after we change to python API. 
cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) fn = serve_utils.generate_replica_launch_log_file_name( From fbde4aa448aec3fa0af3cd1f654829a92b211a29 Mon Sep 17 00:00:00 2001 From: Isaac Ong Date: Thu, 28 Sep 2023 14:58:43 -0700 Subject: [PATCH 080/223] [SkyServe] Add Ray Serve example (#2621) * Add Ray Serve example * Update serve YAML --- examples/serve/ray_serve/ray_serve.yaml | 13 +++++++++++++ examples/serve/ray_serve/serve.py | 17 +++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 examples/serve/ray_serve/ray_serve.yaml create mode 100644 examples/serve/ray_serve/serve.py diff --git a/examples/serve/ray_serve/ray_serve.yaml b/examples/serve/ray_serve/ray_serve.yaml new file mode 100644 index 00000000000..72c0ce92b12 --- /dev/null +++ b/examples/serve/ray_serve/ray_serve.yaml @@ -0,0 +1,13 @@ +resources: + cpus: 2+ + +workdir: examples/serve/ray_serve + +setup: pip install "ray[serve]" + +run: serve run serve:app --host 0.0.0.0 + +service: + port: 8000 + readiness_probe: / + replicas: 1 diff --git a/examples/serve/ray_serve/serve.py b/examples/serve/ray_serve/serve.py new file mode 100644 index 00000000000..a08918cf0f0 --- /dev/null +++ b/examples/serve/ray_serve/serve.py @@ -0,0 +1,17 @@ +from typing import Dict + +from ray import serve +from starlette.requests import Request + + +@serve.deployment(route_prefix="/", num_replicas=2) +class ModelDeployment: + + def __init__(self, msg: str): + self._msg = msg + + def __call__(self, request: Request) -> Dict: + return {"result": self._msg} + + +app = ModelDeployment.bind(msg="Hello Ray Serve!") From fa4981a73fa35a9f11ca9a7c964f219008462713 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 10:16:50 -0700 Subject: [PATCH 081/223] restore job id type --- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/core.py | 2 +- sky/execution.py | 2 +- sky/serve/infra_providers.py | 2 +- sky/skylet/job_lib.py 
| 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 115e6388d4b..be3b3377d47 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3480,7 +3480,7 @@ def get_job_status( handle: CloudVmRayResourceHandle, job_ids: Optional[List[int]] = None, stream_logs: bool = True - ) -> Dict[Optional[str], Optional[job_lib.JobStatus]]: + ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: code = job_lib.JobLibCodeGen.get_job_status(job_ids) returncode, stdout, stderr = self.run_on_head(handle, code, diff --git a/sky/core.py b/sky/core.py index eb3ac256172..e05c4c4b136 100644 --- a/sky/core.py +++ b/sky/core.py @@ -702,7 +702,7 @@ def download_logs( def job_status(cluster_name: str, job_ids: Optional[List[int]], stream_logs: bool = False - ) -> Dict[Optional[str], Optional[job_lib.JobStatus]]: + ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Get the status of jobs. 
diff --git a/sky/execution.py b/sky/execution.py index 43f76ddafed..cdc3f5f8fcd 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1160,7 +1160,7 @@ def _wait_until_job_is_running_on_controller( for _ in range(serve.SERVE_STARTUP_TIMEOUT): job_statuses = backend.get_job_status(handle, [job_id], stream_logs=False) - job_status = job_statuses.get(str(job_id), None) + job_status = job_statuses.get(job_id, None) if job_status == job_lib.JobStatus.RUNNING: return True time.sleep(1) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 875b0c0f6ed..102ef8e3713 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -341,7 +341,7 @@ def _fetch_job_status(self) -> None: # Only fetch job 1, which stands for user task job job_statuses = backend.get_job_status(handle, [1], stream_logs=False) - job_status = job_statuses['1'] + job_status = job_statuses[1] if job_status in [ job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index 5126f40b960..44787ebd429 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -367,7 +367,7 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str: def load_statuses_payload( - statuses_payload: str) -> Dict[Optional[str], Optional[JobStatus]]: + statuses_payload: str) -> Dict[Optional[int], Optional[JobStatus]]: statuses = common_utils.decode_payload(statuses_payload) for job_id, status in statuses.items(): if status is not None: From 5704d0562d3d2d8c7f9a8ba3ff14f6e326b8efd4 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 3 Oct 2023 10:31:16 -0700 Subject: [PATCH 082/223] Apply suggestions from code review Co-authored-by: Zhanghao Wu --- sky/backends/backend_utils.py | 7 ++++++- sky/cli.py | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ee8090dae5b..feb658f4463 100644 --- a/sky/backends/backend_utils.py +++ 
b/sky/backends/backend_utils.py @@ -151,6 +151,11 @@ class ReservedClusterGroup(enum.Enum): @classmethod def get_group(cls, name: str) -> Optional['ReservedClusterGroup']: + """Get the reserved group of a cluster with its name + + Returns the group name if the cluster name is reserved. Otherwise, + returns None. + """ for group in cls: if group.value.check(name): return group @@ -2743,7 +2748,7 @@ def _refresh_service_record_no_lock( cluster_record = global_user_state.get_cluster_from_name(controller_name) # We don't check controller status here since it might be in INIT status - # when other services is starting up and launching the controller. + # when other services is starting and launching the controller. if cluster_record is None: global_user_state.set_service_status( service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) diff --git a/sky/cli.py b/sky/cli.py index dd3ba78d530..23c87180cfe 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2213,7 +2213,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: group = backend_utils.ReservedClusterGroup.get_group(cluster) - assert group is not None + assert group is not None, cluster click.echo(group.value.decline_cancel_hint) sys.exit(1) except ValueError as e: @@ -4108,7 +4108,8 @@ def serve_up( else: prompt = (f'Service {service_name!r} already exists. ' 'Updating a service will be supported in the future. ' - 'For now, `sky serve down` first and try again.') + f'For now, clean up the service and restart: ' + f'sky serve down {service_name}') with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) @@ -4267,7 +4268,7 @@ def serve_status(all: bool, service_names: List[str]): plural = '' if num_failed == 1 else 's' click.echo( f'\n* {num_failed} service{plural} with failed controller found. 
' - 'Please manually check if there is any leaked resources.') + 'Please manually check if there is any leaked resources for services: {", ".join(failed_controllers)}.') @serve.command('down', cls=_DocumentedCodeCommand) From e47e005b9348f4377c19b71cb4b7db4019541526 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 10:39:12 -0700 Subject: [PATCH 083/223] lint --- sky/backends/backend_utils.py | 2 +- sky/cli.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index feb658f4463..0b9ce6672e8 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -152,7 +152,7 @@ class ReservedClusterGroup(enum.Enum): @classmethod def get_group(cls, name: str) -> Optional['ReservedClusterGroup']: """Get the reserved group of a cluster with its name - + Returns the group name if the cluster name is reserved. Otherwise, returns None. """ diff --git a/sky/cli.py b/sky/cli.py index 23c87180cfe..0aa083c6a72 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4108,7 +4108,7 @@ def serve_up( else: prompt = (f'Service {service_name!r} already exists. ' 'Updating a service will be supported in the future. ' - f'For now, clean up the service and restart: ' + 'For now, clean up the service and restart: ' f'sky serve down {service_name}') with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) @@ -4268,7 +4268,8 @@ def serve_status(all: bool, service_names: List[str]): plural = '' if num_failed == 1 else 's' click.echo( f'\n* {num_failed} service{plural} with failed controller found. 
' - 'Please manually check if there is any leaked resources for services: {", ".join(failed_controllers)}.') + 'Please manually check if there is any leaked resources for ' + f'services: {", ".join(failed_controllers)}.') @serve.command('down', cls=_DocumentedCodeCommand) From a86523ce2090bea61ab02e7319036323cf1bed9b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 10:52:32 -0700 Subject: [PATCH 084/223] move service section to the top --- examples/serve/gorilla/gorilla.yaml | 10 +++++----- examples/serve/http_server/task.yaml | 14 +++++++------- examples/serve/llama2/llama2.yaml | 10 +++++----- examples/serve/ray_serve/ray_serve.yaml | 10 +++++----- examples/serve/stable_diffusion_service.yaml | 6 +++--- examples/serve/tgi_coder.yaml | 6 +++--- examples/serve/vicuna-v1.5.yaml | 10 +++++----- examples/serve/vllm.yaml | 6 +++--- 8 files changed, 36 insertions(+), 36 deletions(-) diff --git a/examples/serve/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml index b08abb4b2c8..10d2976093e 100644 --- a/examples/serve/gorilla/gorilla.yaml +++ b/examples/serve/gorilla/gorilla.yaml @@ -1,8 +1,3 @@ -resources: - accelerators: A100:1 - disk_size: 1024 - disk_tier: high - service: port: 8087 readiness_probe: @@ -10,6 +5,11 @@ service: initial_delay_seconds: 1800 replicas: 2 +resources: + accelerators: A100:1 + disk_size: 1024 + disk_tier: high + setup: | conda activate chatbot if [ $? 
-ne 0 ]; then diff --git a/examples/serve/http_server/task.yaml b/examples/serve/http_server/task.yaml index 7faeec72eaa..3c4d9046b45 100644 --- a/examples/serve/http_server/task.yaml +++ b/examples/serve/http_server/task.yaml @@ -1,13 +1,13 @@ -resources: - cpus: 2+ - -workdir: examples/serve/http_server - -run: python3 server.py - service: port: 8081 readiness_probe: path: /health initial_delay_seconds: 20 replicas: 2 + +resources: + cpus: 2+ + +workdir: examples/serve/http_server + +run: python3 server.py diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml index 4647b7b4eda..cb6a70c1adb 100644 --- a/examples/serve/llama2/llama2.yaml +++ b/examples/serve/llama2/llama2.yaml @@ -1,14 +1,14 @@ +service: + port: 8087 + readiness_probe: /v1/models + replicas: 2 + resources: memory: 32+ accelerators: T4:1 disk_size: 1024 disk_tier: high -service: - port: 8087 - readiness_probe: /v1/models - replicas: 2 - envs: MODEL_SIZE: 7 HF_TOKEN: # TODO: Replace with huggingface token diff --git a/examples/serve/ray_serve/ray_serve.yaml b/examples/serve/ray_serve/ray_serve.yaml index 72c0ce92b12..c47c8b74be1 100644 --- a/examples/serve/ray_serve/ray_serve.yaml +++ b/examples/serve/ray_serve/ray_serve.yaml @@ -1,3 +1,8 @@ +service: + port: 8000 + readiness_probe: / + replicas: 1 + resources: cpus: 2+ @@ -6,8 +11,3 @@ workdir: examples/serve/ray_serve setup: pip install "ray[serve]" run: serve run serve:app --host 0.0.0.0 - -service: - port: 8000 - readiness_probe: / - replicas: 1 diff --git a/examples/serve/stable_diffusion_service.yaml b/examples/serve/stable_diffusion_service.yaml index c96ee9c77ed..405b3bc7407 100644 --- a/examples/serve/stable_diffusion_service.yaml +++ b/examples/serve/stable_diffusion_service.yaml @@ -3,14 +3,14 @@ # Usage: # .. 
-resources: - accelerators: V100:1 - service: port: 7860 readiness_probe: / replicas: 2 +resources: + accelerators: V100:1 + file_mounts: /stable_diffusion: examples/stable_diffusion diff --git a/examples/serve/tgi_coder.yaml b/examples/serve/tgi_coder.yaml index a64bf41f226..71f109d1179 100644 --- a/examples/serve/tgi_coder.yaml +++ b/examples/serve/tgi_coder.yaml @@ -1,11 +1,11 @@ -resources: - accelerators: A100:1 - service: port: 8082 readiness_probe: /health replicas: 2 +resources: + accelerators: A100:1 + # TODO(tian): Maybe use some small model like 3b. run: | docker run --gpus all --shm-size 1g -p 8082:80 -v ~/data:/data ghcr.io/huggingface/text-generation-inference --model-id WizardLM/WizardCoder-15B-V1.0 diff --git a/examples/serve/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml index d36b571e6b4..6ca32de5ba8 100644 --- a/examples/serve/vicuna-v1.5.yaml +++ b/examples/serve/vicuna-v1.5.yaml @@ -1,13 +1,13 @@ -resources: - accelerators: A100:1 - disk_size: 1024 - disk_tier: high - service: port: 8087 readiness_probe: /v1/models replicas: 2 +resources: + accelerators: A100:1 + disk_size: 1024 + disk_tier: high + envs: MODEL_SIZE: 13 diff --git a/examples/serve/vllm.yaml b/examples/serve/vllm.yaml index e74b546b0f0..01f63ccc33e 100644 --- a/examples/serve/vllm.yaml +++ b/examples/serve/vllm.yaml @@ -1,6 +1,3 @@ -resources: - accelerators: A100:1 - service: port: 8081 readiness_probe: @@ -9,6 +6,9 @@ service: initial_delay_seconds: 1200 replicas: 2 +resources: + accelerators: A100:1 + setup: | conda activate chatbot if [ $? 
-eq 0 ]; then From 27a29b4bf5ec56c99be8c39c860572ea43d2482d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 10:56:35 -0700 Subject: [PATCH 085/223] add docstr --- sky/backends/backend.py | 7 +++++++ sky/backends/local_docker_backend.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sky/backends/backend.py b/sky/backends/backend.py index c5e76db026f..69e200e1432 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -87,6 +87,13 @@ def execute(self, task: 'task_lib.Task', detach_run: bool, dryrun: bool = False) -> Optional[int]: + """Execute a job on the cluster. + + Returns: + The job id if the job is submitted successfully, None otherwise. + Job id is a CloudVMRayBackend-specific concept, so all other + backends should return None. + """ usage_lib.record_cluster_name_for_current_operation( handle.get_cluster_name()) usage_lib.messages.usage.update_actual_task(task) diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index ce8009ed949..e9d8b8add06 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -269,7 +269,7 @@ def _execute(self, task: 'task_lib.Task', detach_run: bool, dryrun: bool = False) -> Optional[int]: - """ Launches the container. + """Launches the container. Returns: The job id if the job is submitted successfully. 
LocalDockerBackend From ecc0640bb9efdfd053b5c79a649a4ff376e1525b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 11:17:13 -0700 Subject: [PATCH 086/223] make controller port not optinal --- sky/backends/backend_utils.py | 10 +--------- sky/backends/cloud_vm_ray_backend.py | 5 ----- sky/core.py | 6 ------ sky/execution.py | 24 ++++++++++++------------ sky/serve/serve_utils.py | 23 ++++++++++------------- 5 files changed, 23 insertions(+), 45 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 0b9ce6672e8..64fbc590475 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2749,20 +2749,12 @@ def _refresh_service_record_no_lock( # We don't check controller status here since it might be in INIT status # when other services is starting and launching the controller. - if cluster_record is None: - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, None + assert cluster_record is not None handle = cluster_record['handle'] backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - if service_handle.controller_port is None: - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, None - code = serve_lib.ServeCodeGen.get_latest_info( service_handle.controller_port) returncode, latest_info_payload, _ = backend.run_on_head( diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 830f0f6405d..1cc975f9695 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3800,11 +3800,6 @@ def tail_serve_logs(self, handle: CloudVmRayResourceHandle, stream_controller=controller, follow=follow) else: - if service_handle.controller_port is None: - logger.warning('Controller task is not successfully launched ' - f'for service {service_handle.service_name!r}. 
' - 'Cannot stream logs.') - return assert replica_id is not None, service_handle code = serve_lib.ServeCodeGen.stream_replica_logs( service_handle.service_name, service_handle.controller_port, diff --git a/sky/core.py b/sky/core.py index e05c4c4b136..e466ebc0a4f 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1131,12 +1131,6 @@ def serve_down(service_name: str, purge: bool = False) -> None: backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) try: - if service_handle.controller_port is None: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Controller job of service {service_name!r} ' - 'not found.') - code = serve.ServeCodeGen.terminate_service( service_handle.controller_port) diff --git a/sky/execution.py b/sky/execution.py index f109b09a7ca..ea9721e5048 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1016,12 +1016,6 @@ def serve_up( task.set_resources(requested_resources.copy(ports=[task.service.app_port])) - service_handle = serve.ServiceHandle( - service_name=service_name, - policy=task.service.policy_str(), - requested_resources=requested_resources, - requested_controller_resources=controller_resources, - auto_restart=task.service.auto_restart) # Use filelock here to make sure only one process can write to database # at the same time. 
Then we generate available controller name again to # make sure even in race condition, we can still get the correct controller @@ -1037,18 +1031,24 @@ def serve_up( serve.CONTROLLER_FILE_LOCK_TIMEOUT): controller_name, _ = serve.get_available_controller_name( controller_resources) + controller_port, load_balancer_port = ( + serve.gen_ports_for_serve_process(controller_name)) + + service_handle = serve.ServiceHandle( + service_name=service_name, + policy=task.service.policy_str(), + requested_resources=requested_resources, + requested_controller_resources=controller_resources, + auto_restart=task.service.auto_restart, + controller_port=controller_port, + load_balancer_port=load_balancer_port) + global_user_state.add_or_update_service( service_name, launched_at=int(time.time()), controller_name=controller_name, handle=service_handle, status=status_lib.ServiceStatus.CONTROLLER_INIT) - - controller_port, load_balancer_port = ( - serve.gen_ports_for_serve_process(controller_name)) - service_handle.controller_port = controller_port - service_handle.load_balancer_port = load_balancer_port - global_user_state.set_service_handle(service_name, service_handle) controller_resources = controller_resources.copy( ports=[load_balancer_port]) except filelock.Timeout as e: diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index b5e51b259d7..6a4b3de62ef 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -169,14 +169,11 @@ def get_replica_id_from_cluster_name(cluster_name: str) -> int: def gen_ports_for_serve_process(controller_name: str) -> Tuple[int, int]: services = global_user_state.get_services_from_controller_name( controller_name) - # Use `is None` to filter out self and all services with initialize status existing_controller_ports, existing_load_balancer_ports = set(), set() for service in services: service_handle: ServiceHandle = service['handle'] - if service_handle.controller_port is not None: - 
existing_controller_ports.add(service_handle.controller_port) - if service_handle.load_balancer_port is not None: - existing_load_balancer_ports.add(service_handle.load_balancer_port) + existing_controller_ports.add(service_handle.controller_port) + existing_load_balancer_ports.add(service_handle.load_balancer_port) controller_port = constants.CONTROLLER_PORT_START while controller_port in existing_controller_ports: controller_port += 1 @@ -313,10 +310,10 @@ class ServiceHandle(object): - (required) Service requested resources. - (required) Service requested controller resources. - (required) Whether the service have auto restart enabled. + - (required) Controller port. + - (required) LoadBalancer port. - (optional) Service uptime. - (optional) Service endpoint IP. - - (optional) Controller port. - - (optional) LoadBalancer port. - (optional) Controller and LoadBalancer job id. - (optional) Ephemeral storage generated for the service. @@ -332,31 +329,29 @@ def __init__( requested_resources: 'sky.Resources', requested_controller_resources: 'sky.Resources', auto_restart: bool, + controller_port: int, + load_balancer_port: int, uptime: Optional[int] = None, endpoint_ip: Optional[str] = None, - controller_port: Optional[int] = None, - load_balancer_port: Optional[int] = None, job_id: Optional[int] = None, ephemeral_storage: Optional[List[Dict[str, Any]]] = None, ) -> None: self._version = self._VERSION self.service_name = service_name - self.uptime = uptime - self.endpoint_ip = endpoint_ip self.policy = policy self.requested_resources = requested_resources self.requested_controller_resources = requested_controller_resources self.auto_restart = auto_restart self.controller_port = controller_port self.load_balancer_port = load_balancer_port + self.uptime = uptime + self.endpoint_ip = endpoint_ip self.job_id = job_id self.ephemeral_storage = ephemeral_storage def __repr__(self) -> str: return ('ServiceHandle(' f'\n\tservice_name={self.service_name},' - 
f'\n\tuptime={self.uptime},' - f'\n\tendpoint_ip={self.endpoint_ip},' f'\n\tpolicy={self.policy},' f'\n\trequested_resources={self.requested_resources},' '\n\trequested_controller_resources=' @@ -364,6 +359,8 @@ def __repr__(self) -> str: f'\n\tauto_restart={self.auto_restart},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' + f'\n\tuptime={self.uptime},' + f'\n\tendpoint_ip={self.endpoint_ip},' f'\n\tjob_id={self.job_id},' f'\n\tephemeral_storage={self.ephemeral_storage})') From 9bf337b4d9b31128973a02ac27d9ad6d8cdc92f0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 11:24:37 -0700 Subject: [PATCH 087/223] remove cpu demand for gpu workloads --- sky/backends/backend_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 64fbc590475..93605ded5a5 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2914,7 +2914,8 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: assert len(task.resources) == 1, task.resources resources = list(task.resources)[0] if resources is not None and resources.accelerators is not None: - resources_dict.update(resources.accelerators) + # If any accelerator is requested, use GPU resource instead. + resources_dict = resources.accelerators return resources_dict From b8ce4c4ee0e41852087f8e7bcc8b249c29071840 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 11:28:40 -0700 Subject: [PATCH 088/223] make ReservedClusterGroup.get_group accept none arg --- sky/backends/backend_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 93605ded5a5..c6fc51fa512 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -150,12 +150,14 @@ class ReservedClusterGroup(enum.Enum): 'sky serve controller. 
')) @classmethod - def get_group(cls, name: str) -> Optional['ReservedClusterGroup']: + def get_group(cls, name: Optional[str]) -> Optional['ReservedClusterGroup']: """Get the reserved group of a cluster with its name Returns the group name if the cluster name is reserved. Otherwise, returns None. """ + if name is None: + return None for group in cls: if group.value.check(name): return group @@ -2938,8 +2940,6 @@ def check_cluster_name_not_reserved( Returns: None, if the cluster name is not reserved. """ - if cluster_name is None: - return group = ReservedClusterGroup.get_group(cluster_name) if group is not None: msg = group.value.check_cluster_name_hint From 01bea22bd8ec1a2dd4f6e2c6477a75a23063e48e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 12:23:31 -0700 Subject: [PATCH 089/223] nit --- sky/backends/cloud_vm_ray_backend.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 1cc975f9695..5d4618813ce 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -316,10 +316,10 @@ def add_gang_scheduling_placement_group_and_setup( # by default. bundles = [{'CPU': task_cpu_demand} for _ in range(num_nodes)] - if len(resources_dict) > 0: - assert len(resources_dict) == 1, \ - ('There can only be one type of accelerator per instance.' - f' Found: {resources_dict}.') + if resources_dict: + assert len(resources_dict) == 1, ( + 'There can only be one type of accelerator per instance. ' + f'Found: {resources_dict}.') acc_name = list(resources_dict.keys())[0] acc_count = list(resources_dict.values())[0] gpu_dict = {'GPU': acc_count} @@ -481,10 +481,10 @@ def add_ray_task(self, options.append(f'num_cpus={task_cpu_demand}') num_gpus = 0.0 - if len(ray_resources_dict) > 0: - assert len(ray_resources_dict) == 1, \ - ('There can only be one type of accelerator per instance.' 
- f' Found: {ray_resources_dict}.') + if ray_resources_dict: + assert len(ray_resources_dict) == 1, ( + 'There can only be one type of accelerator per instance. ' + f'Found: {ray_resources_dict}.') num_gpus = list(ray_resources_dict.values())[0] options.append(f'resources={json.dumps(ray_resources_dict)}') From d27dab82784c1cc35514f18ed690507cfbbc9338 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 12:23:49 -0700 Subject: [PATCH 090/223] remove yaml_only and task_only in _make_task_or_dag_from_entrypoint_with_overrides --- sky/cli.py | 27 +++++++++++++-------------- sky/task.py | 4 +++- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 0aa083c6a72..963ef90f436 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -113,6 +113,10 @@ '{cluster_num} cluster{plural} {verb}. Please specify an existing ' 'cluster to show its IP address.\nUsage: `sky status --ip `') +_DAG_NOT_SUPPORT_MESSAGE = ('YAML specifies a DAG which is only supported by ' + '`sky spot launch`. 
`{command}` supports a ' + 'single task only.') + def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: """Returns a list of clusters that match the glob pattern.""" @@ -1062,8 +1066,6 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: List[str], *, - yaml_only: bool = False, - task_only: bool = False, entrypoint_name: str = 'Task', name: Optional[str] = None, cluster: Optional[str] = None, @@ -1101,9 +1103,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides( nl=False) click.secho(entrypoint, bold=True) else: - if yaml_only: - raise click.UsageError( - f'Expected a yaml file, but got {entrypoint}.') if not entrypoint: entrypoint = None else: @@ -1134,9 +1133,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides( usage_lib.messages.usage.update_user_task_yaml(entrypoint) dag = dag_utils.load_chain_dag_from_yaml(entrypoint, env_overrides=env) if len(dag.tasks) > 1: - if task_only: - raise click.UsageError( - f'Expected a single task, but got {len(dag.tasks)} tasks.') # When the dag has more than 1 task. It is unclear how to # override the params for the dag. So we just ignore the # override params. @@ -1150,7 +1146,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( f'If you see this, please file an issue; tasks: {dag.tasks}') task = dag.tasks[0] else: - task = sky.Task(name='sky-cmd', run=entrypoint) + task = sky.Task(name=sky.Task.CLI_CMD_TASK_NAME, run=entrypoint) task.set_resources({sky.Resources()}) # Override. @@ -1454,9 +1450,7 @@ def launch( ) if isinstance(task_or_dag, sky.Dag): raise click.UsageError( - 'YAML specifies a DAG which is only supported by ' - '`sky spot launch`. 
`sky launch` supports a ' - 'single task only.') + _DAG_NOT_SUPPORT_MESSAGE.format(command='sky launch')) task = task_or_dag backend: backends.Backend @@ -4114,8 +4108,13 @@ def serve_up( raise RuntimeError(prompt) task = _make_task_or_dag_from_entrypoint_with_overrides( - entrypoint, yaml_only=True, task_only=True, entrypoint_name='Service') - assert isinstance(task, sky.Task) + entrypoint, entrypoint_name='Service') + if isinstance(task, sky.Dag): + raise click.UsageError( + _DAG_NOT_SUPPORT_MESSAGE.format(command='sky serve up')) + if task.name == sky.Task.CLI_CMD_TASK_NAME: + raise click.UsageError( + 'For `sky serve up`, the entrypoint must be a YAML file.') if task.service is None: with ux_utils.print_exception_no_traceback(): diff --git a/sky/task.py b/sky/task.py index bc59bcad87c..d815c128bc7 100644 --- a/sky/task.py +++ b/sky/task.py @@ -162,6 +162,8 @@ def _add_docker_login_config(resources: 'resources_lib.Resources'): class Task: """Task: a computation to be run on the cloud.""" + CLI_CMD_TASK_NAME = 'sky-cmd' + def __init__( self, name: Optional[str] = None, @@ -1029,7 +1031,7 @@ def __rshift__(self, b): sky.dag.get_current_dag().add_edge(self, b) def __repr__(self): - if self.name and self.name != 'sky-cmd': # CLI launch with a command + if self.name and self.name != self.CLI_CMD_TASK_NAME: return self.name if isinstance(self.run, str): run_msg = self.run.replace('\n', '\\n') From c00fcdf40e1a2ea7be50a299c0c832bb36ca55cb Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 12:37:05 -0700 Subject: [PATCH 091/223] cli nits --- sky/cli.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 963ef90f436..2e7f01a35c4 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1772,6 +1772,7 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, """ # Using a pool with 1 worker to run the spot job query in parallel to speed # up. 
The pool provides a AsyncResult object that can be used as a future. + # TODO(tian): Show service as well. with multiprocessing.Pool(1) as pool: # Do not show spot queue if user specifies clusters, and if user # specifies --ip. @@ -2833,7 +2834,7 @@ def _down_or_stop_clusters( ] # Make sure the reserved clusters are explicitly specified without other # normal clusters. - if len(reserved_clusters) > 0: + if reserved_clusters: name2group: Dict[str, backend_utils.ReservedClusterGroup] = dict() for name in reserved_clusters: group = backend_utils.ReservedClusterGroup.get_group(name) @@ -2851,16 +2852,21 @@ def _down_or_stop_clusters( f'{reserved_clusters_str} with other cluster(s) ' f'{names_str} is currently not supported.\n' f'Please omit the reserved cluster(s) {reserved_clusters}.') + if len(reserved_clusters) > 1: + raise click.UsageError( + f'{operation} multiple reserved clusters ' + f'{reserved_clusters_str} is currently not supported.\n' + f'Please specify only one reserved cluster.') + reserved_cluster = reserved_clusters[0] if not down: raise click.UsageError( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is currently not supported. ' f'{decline_stop_hints}') else: - for reserved_cluster in reserved_clusters: - hint_or_raise = _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE[ - name2group[reserved_cluster]] - hint_or_raise(reserved_cluster) + hint_or_raise = _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE[ + name2group[reserved_cluster]] + hint_or_raise(reserved_cluster) confirm_str = 'delete' user_input = click.prompt( f'To proceed, please check the information above and type ' @@ -4098,7 +4104,8 @@ def serve_up( status_lib.ServiceStatus.FAILED ]: prompt = (f'Service {service_name!r} has failed. ' - 'Please clean up the service and try again.') + 'Please clean up the service and restart: ' + f'sky serve down {service_name}') else: prompt = (f'Service {service_name!r} already exists. ' 'Updating a service will be supported in the future. 
' From 2b43b83b562ae86df8c0f87c4fe1e9f2cf91aea1 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Oct 2023 12:43:51 -0700 Subject: [PATCH 092/223] remove get_glob_service_names --- sky/cli.py | 2 +- sky/global_user_state.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 2e7f01a35c4..d365d1f5e6c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -503,7 +503,7 @@ def _complete_service_name(ctx: click.Context, param: click.Parameter, incomplete: str) -> List[str]: """Handle shell completion for service names.""" del ctx, param # Unused. - return global_user_state.get_service_names_start_with(incomplete) + return global_user_state.get_glob_service_names(f'{incomplete}*') def _complete_storage_name(ctx: click.Context, param: click.Parameter, diff --git a/sky/global_user_state.py b/sky/global_user_state.py index c59e5956829..4fd0cee0031 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -741,12 +741,6 @@ def get_cluster_names_start_with(starts_with: str) -> List[str]: return [row[0] for row in rows] -def get_service_names_start_with(starts_with: str) -> List[str]: - rows = _DB.cursor.execute('SELECT name FROM services WHERE name LIKE (?)', - (f'{starts_with}%',)) - return [row[0] for row in rows] - - def get_enabled_clouds() -> List[clouds.Cloud]: rows = _DB.cursor.execute('SELECT value FROM config WHERE key = ?', (_ENABLED_CLOUDS_KEY,)) From 65b9bb2244d9a6b500534b1da5cd15320a8dd19a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 4 Oct 2023 09:31:39 -0700 Subject: [PATCH 093/223] fix pop CPU --- sky/backends/backend_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index c6fc51fa512..827c27b77fa 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2916,8 +2916,7 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: assert len(task.resources) == 1, task.resources 
resources = list(task.resources)[0] if resources is not None and resources.accelerators is not None: - # If any accelerator is requested, use GPU resource instead. - resources_dict = resources.accelerators + resources_dict.update(resources.accelerators) return resources_dict From 28f613da35c861db191a6696dcd023248bfc533d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 4 Oct 2023 09:55:28 -0700 Subject: [PATCH 094/223] remove CPU demand for job when presented in CLI --- sky/backends/backend_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 827c27b77fa..04faffb9388 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2922,6 +2922,8 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: def get_task_resources_str(task: 'task_lib.Task') -> str: resources_dict = get_task_demands_dict(task) + if len(resources_dict) > 1: + resources_dict.pop('CPU') resources_str = ', '.join(f'{k}:{v}' for k, v in resources_dict.items()) resources_str = f'{task.num_nodes}x [{resources_str}]' return resources_str From e9734429f7bd0cc64713bd4d70ab21ba61f69aaa Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 4 Oct 2023 23:17:50 -0700 Subject: [PATCH 095/223] remove cancel and use os.kill now --- sky/core.py | 11 ----------- sky/serve/controller.py | 31 +++++++++++++++++++++++++++++++ sky/serve/load_balancer.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/sky/core.py b/sky/core.py index e466ebc0a4f..5665136bca2 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1190,17 +1190,6 @@ def serve_down(service_name: str, purge: bool = False) -> None: backend = backends.CloudVmRayBackend() backend.register_info(minimize_logging=True) - # Cancel the controller and load balancer jobs. - # For the case when controller / load_balancer job failed to submit. 
- jobs = [] - if service_handle.job_id is not None: - jobs.append(service_handle.job_id) - - try: - backend.cancel_jobs(handle, jobs=jobs) - except exceptions.FetchIPError as e: - raise RuntimeError(controller_fetch_ip_error_message) from e - # Cleanup all files on controller related to this service. # We have a 10-min grace period for the controller to autostop, # so it should be fine if this is the last service on the diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 5d3f9b4184a..5bcc149e9ff 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -6,7 +6,11 @@ import asyncio import base64 import logging +import os import pickle +import signal +import threading +import time from typing import Optional import fastapi @@ -47,8 +51,20 @@ def __init__(self, self.port = port self.infra_provider = infra_provider self.autoscaler = autoscaler + self.terminating = False + self.load_balancer_received_terminal_signal = False self.app = fastapi.FastAPI() + def _check_terminate(self): + while True: + if self.terminating and self.load_balancer_received_terminal_signal: + # 1s grace period for the rare case that terminate is set but + # return of /terminate request is not ready yet. 
+ time.sleep(1) + logger.info('Terminate controller...') + os.kill(os.getpid(), signal.SIGINT) + time.sleep(10) + def run(self) -> None: @self.app.post('/controller/update_num_requests') @@ -72,6 +88,10 @@ def get_autoscaler_query_interval(): def get_ready_replicas(): return {'ready_replicas': self.infra_provider.get_ready_replicas()} + @self.app.get('/controller/is_terminating') + def is_terminating(): + return {'is_terminating': self.terminating} + @self.app.get('/controller/get_latest_info') def get_latest_info(): latest_info = { @@ -93,6 +113,10 @@ def terminate(request: fastapi.Request): logger.info('Terminate autoscaler...') self.autoscaler.terminate() msg = self.infra_provider.terminate() + if msg is None: + # We cannot terminate the controller now because we still + # need the output of this request to be sent back. + self.terminating = True return {'message': msg} # Run replica_prober and autoscaler (if autoscaler is defined) @@ -102,6 +126,13 @@ def terminate(request: fastapi.Request): if self.autoscaler is not None: self.autoscaler.start() + # Start a daemon to check if the controller is terminating, and if so, + # shutdown the controller so the skypilot jobs will finish, thus enable + # the controller VM to autostop. + terminate_checking_daemon = threading.Thread( + target=self._check_terminate, daemon=True) + terminate_checking_daemon.start() + # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflowing # the controller logs. 
if not env_options.Options.SHOW_DEBUG_INFO.get(): diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 7c5317ae941..c1e661c8faa 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,5 +1,7 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" import argparse +import os +import signal import threading import time @@ -60,6 +62,17 @@ def _sync_with_controller(self): while True: with requests.Session() as session: try: + # TODO(tian): Maybe merge all of them into one request? + # check if the controller is terminating. If so, shut down + # the load balancer so the skypilot jobs will finish, thus + # enable the controller VM to autostop. + response = session.get(self.controller_url + + '/controller/is_terminating') + response.raise_for_status() + if bool(response.json()['is_terminating']): + logger.info('Controller is terminating. ' + 'Shutting down load balancer.') + os.kill(os.getpid(), signal.SIGINT) # send request num in last query interval response = session.post( self.controller_url + '/controller/update_num_requests', From 1b31110bdd35213c359e1b77efd30fa776c71191 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 5 Oct 2023 13:49:45 -0700 Subject: [PATCH 096/223] add db on controller VM, remove job id and use skylet to refresh service status --- sky/backends/backend.py | 11 +--- sky/backends/backend_utils.py | 64 ++++++++----------- sky/backends/cloud_vm_ray_backend.py | 24 +++---- sky/backends/local_docker_backend.py | 14 ++-- sky/cli.py | 33 ++-------- sky/core.py | 17 +---- sky/execution.py | 66 +++---------------- sky/global_user_state.py | 25 ++------ sky/serve/__init__.py | 2 - sky/serve/constants.py | 3 - sky/serve/controller.py | 16 ++++- sky/serve/infra_providers.py | 16 ++--- sky/serve/serve_state.py | 95 ++++++++++++++++++++++++++++ sky/serve/serve_utils.py | 78 +++++++++++++++++++---- sky/skylet/constants.py | 2 +- sky/skylet/events.py | 9 +++ sky/skylet/skylet.py | 1 + 
sky/status_lib.py | 4 ++ sky/task.py | 4 +- sky/utils/cli_utils/status_utils.py | 3 +- 20 files changed, 266 insertions(+), 221 deletions(-) create mode 100644 sky/serve/serve_state.py diff --git a/sky/backends/backend.py b/sky/backends/backend.py index 69e200e1432..28aa981b078 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -86,14 +86,7 @@ def execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> Optional[int]: - """Execute a job on the cluster. - - Returns: - The job id if the job is submitted successfully, None otherwise. - Job id is a CloudVMRayBackend-specific concept, so all other - backends should return None. - """ + dryrun: bool = False) -> None: usage_lib.record_cluster_name_for_current_operation( handle.get_cluster_name()) usage_lib.messages.usage.update_actual_task(task) @@ -150,7 +143,7 @@ def _execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> Optional[int]: + dryrun: bool = False) -> None: raise NotImplementedError def _post_execute(self, handle: _ResourceHandleType, down: bool) -> None: diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 04faffb9388..6a3d882c379 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,4 @@ """Util constants/functions for the backends.""" -import copy import dataclasses from datetime import datetime import difflib @@ -2716,6 +2715,17 @@ def _refresh_cluster(cluster_name): return kept_records +def _add_default_value_to_local_record( + record: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + # NOTE(dev): Keep this align with sky.serve.controller.get_latest_info + if record is None: + return record + record['status'] = status_lib.ServiceStatus.UNKNOWN + record['uptime'] = None + record['replica_info'] = [] + return record + + def _refresh_service_record_no_lock( service_name: str) -> Tuple[Optional[Dict[str, Any]], 
Optional[str]]: """Refresh the service, and return the possibly updated record. @@ -2727,30 +2737,24 @@ def _refresh_service_record_no_lock( A tuple of a possibly updated record and an error message if any error occurred when refreshing the service. """ - local_record = global_user_state.get_service_from_name(service_name) - if local_record is None: + record = global_user_state.get_service_from_name(service_name) + if record is None: return None, None - - # We use a copy of the record with default value of replica_info to return - # when there is an error. - record = copy.deepcopy(local_record) - record['replica_info'] = [] - service_handle: serve_lib.ServiceHandle = local_record['handle'] + _add_default_value_to_local_record(record) try: check_network_connection() except exceptions.NetworkError: return record, 'Failed to refresh replica info due to network error.' + service_handle: serve_lib.ServiceHandle = record['handle'] if not service_handle.endpoint_ip: # Service controller is still initializing. Skipped refresh status. + record['status'] = status_lib.ServiceStatus.CONTROLLER_INIT return record, None - controller_name = local_record['controller_name'] + controller_name = record['controller_name'] cluster_record = global_user_state.get_cluster_from_name(controller_name) - - # We don't check controller status here since it might be in INIT status - # when other services is starting and launching the controller. 
assert cluster_record is not None handle = cluster_record['handle'] @@ -2759,37 +2763,18 @@ def _refresh_service_record_no_lock( code = serve_lib.ServeCodeGen.get_latest_info( service_handle.controller_port) - returncode, latest_info_payload, _ = backend.run_on_head( + returncode, latest_info_payload, stderr = backend.run_on_head( handle, code, require_outputs=True, stream_logs=False, separate_stderr=True) if returncode != 0: - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - return record, None + return record, stderr latest_info = serve_lib.load_latest_info(latest_info_payload) - service_handle.uptime = latest_info['uptime'] - - # When the service is shutting down, there is a period of time which the - # controller still responds to the request, and the replica is not - # terminated, so the return value for _service_status_from_replica_info - # will still be READY, but we don't want change service status to READY. - # For controller init, there is a small chance that the controller is - # running but the load balancer is not. In this case, the service status - # shouldn't be refreshed too. - if local_record['status'] not in [ - status_lib.ServiceStatus.SHUTTING_DOWN, - status_lib.ServiceStatus.CONTROLLER_INIT, - ]: - local_record['status'] = serve_lib.replica_info_to_service_status( - latest_info['replica_info']) - - global_user_state.add_or_update_service(**local_record) - local_record['replica_info'] = latest_info['replica_info'] - return local_record, None + record.update(latest_info) + return record, None def _refresh_service_record( @@ -2804,7 +2789,8 @@ def _refresh_service_record( except filelock.Timeout: msg = ('Failed get the lock for service ' f'{service_name!r}. 
Using the cached record.') - return global_user_state.get_service_from_name(service_name), msg + return _add_default_value_to_local_record( + global_user_state.get_service_from_name(service_name)), msg # TODO(tian): Maybe aggregate services using same controller to reduce SSH @@ -2905,8 +2891,8 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: resources_dict = { # We set CPU resource for sky serve controller to a smaller value # to support a larger number of services. - 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND if - task.is_sky_serve_controller_task else DEFAULT_TASK_CPU_DEMAND) + 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND + if task.service_handle is not None else DEFAULT_TASK_CPU_DEMAND) } if task.best_resources is not None: resources = task.best_resources diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 211fe677c16..a842ece13fe 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3241,6 +3241,7 @@ def _exec_code_on_head( executable: str, detach_run: bool = False, spot_dag: Optional['dag.Dag'] = None, + service_handle: Optional['serve_lib.ServiceHandle'] = None, ) -> None: """Executes generated code on the head node.""" style = colorama.Style @@ -3311,6 +3312,11 @@ def _exec_code_on_head( # the controller process job, as it will stay in the job pending # table and not be executed until there is an empty slot. job_submit_cmd = job_submit_cmd + ' && ' + spot_code + if service_handle is not None: + # Add the service to service table on controller VM. + serve_code = serve_lib.ServeCodeGen.add_service( + job_id, service_handle) + job_submit_cmd = job_submit_cmd + ' && ' + serve_code returncode, stdout, stderr = self.run_on_head(handle, job_submit_cmd, @@ -3446,15 +3452,10 @@ def _execute( task: task_lib.Task, detach_run: bool, dryrun: bool = False, - ) -> Optional[int]: - """Execute a job on the cluster. 
- - Returns: - The job id if the job is submitted successfully, None otherwise. - """ + ) -> None: if task.run is None: logger.info('Run commands not specified or empty.') - return None + return # Check the task resources vs the cluster resources. Since `sky exec` # will not run the provision and _check_existing_cluster # We need to check ports here since sky.exec shouldn't change resources @@ -3464,7 +3465,7 @@ def _execute( if dryrun: logger.info(f'Dryrun complete. Would have run:\n{task}') - return None + return job_id = self._add_job(handle, task.name, resources_str) @@ -3475,7 +3476,6 @@ def _execute( else: # Case: task_lib.Task(run, num_nodes=1) self._execute_task_one_node(handle, task, job_id, detach_run) - return job_id def _post_execute(self, handle: CloudVmRayResourceHandle, down: bool) -> None: @@ -4692,7 +4692,8 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag) + spot_dag=task.spot_dag, + service_handle=task.service_handle) def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, @@ -4766,4 +4767,5 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag) + spot_dag=task.spot_dag, + service_handle=task.service_handle) diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index e9d8b8add06..cfdd432eda3 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -268,13 +268,8 @@ def _execute(self, handle: LocalDockerResourceHandle, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> Optional[int]: - """Launches the container. - - Returns: - The job id if the job is submitted successfully. LocalDockerBackend - does not have the concept of job id, so this is always None. 
- """ + dryrun: bool = False) -> None: + """ Launches the container.""" if detach_run: raise NotImplementedError('detach_run=True is not supported in ' 'LocalDockerBackend.') @@ -287,14 +282,13 @@ def _execute(self, # Handle a basic task if task.run is None: logger.info(f'Nothing to run; run command not specified:\n{task}') - return None + return if dryrun: logger.info(f'Dryrun complete. Would have run:\n{task}') - return None + return self._execute_task_one_node(handle, task) - return None def _post_execute(self, handle: LocalDockerResourceHandle, down: bool) -> None: diff --git a/sky/cli.py b/sky/cli.py index 7033ef1ba39..58915d18ae8 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4103,18 +4103,10 @@ def serve_up( previous_service_record = global_user_state.get_service_from_name( service_name) if previous_service_record is not None: - if previous_service_record['status'] in [ - status_lib.ServiceStatus.CONTROLLER_FAILED, - status_lib.ServiceStatus.FAILED - ]: - prompt = (f'Service {service_name!r} has failed. ' - 'Please clean up the service and restart: ' - f'sky serve down {service_name}') - else: - prompt = (f'Service {service_name!r} already exists. ' - 'Updating a service will be supported in the future. ' - 'For now, clean up the service and restart: ' - f'sky serve down {service_name}') + prompt = (f'Service {service_name!r} already exists. ' + 'Updating a service will be supported in the future. 
' + 'For now, clean up the service and restart: ' + f'sky serve down {service_name}') with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) @@ -4245,12 +4237,12 @@ def serve_status(all: bool, service_names: List[str]): # Only show status of my-service sky serve status my-service """ - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' - f'{colorama.Style.RESET_ALL}') query_services: Optional[List[str]] = None if service_names: query_services = _get_glob_services(service_names) service_records = core.serve_status(query_services) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' + f'{colorama.Style.RESET_ALL}') status_utils.show_service_table(service_records, all) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Replicas{colorama.Style.RESET_ALL}') @@ -4268,19 +4260,6 @@ def serve_status(all: bool, service_names: List[str]): replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) - failed_controllers = [ - record['name'] - for record in service_records - if record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED - ] - if failed_controllers: - num_failed = len(failed_controllers) - plural = '' if num_failed == 1 else 's' - click.echo( - f'\n* {num_failed} service{plural} with failed controller found. ' - 'Please manually check if there is any leaked resources for ' - f'services: {", ".join(failed_controllers)}.') - @serve.command('down', cls=_DocumentedCodeCommand) @click.argument('service_names', diff --git a/sky/core.py b/sky/core.py index 5665136bca2..578811acdfe 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1074,15 +1074,6 @@ def serve_tail_logs( with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r} does not exist. 
' 'Cannot stream logs.') - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Service {service_name!r} is still initializing its ' - 'controller. Please try again later.') - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r}\'s controller failed. ' - 'Cannot tail logs.') service_handle: serve.ServiceHandle = service_record['handle'] controller_name = service_record['controller_name'] handle = global_user_state.get_handle_from_cluster_name(controller_name) @@ -1119,8 +1110,6 @@ def serve_down(service_name: str, purge: bool = False) -> None: service_handle: serve.ServiceHandle = service_record['handle'] controller_name = service_record['controller_name'] - global_user_state.set_service_status(service_name, - status_lib.ServiceStatus.SHUTTING_DOWN) handle = global_user_state.get_handle_from_cluster_name(controller_name) controller_fetch_ip_error_message = ( @@ -1194,7 +1183,8 @@ def serve_down(service_name: str, purge: bool = False) -> None: # We have a 10-min grace period for the controller to autostop, # so it should be fine if this is the last service on the # controller and its job is the only one running. 
- code = serve.ServeCodeGen.cleanup_service_files(service_name) + # Also, Cleanup the service record in controller VM + code = serve.ServeCodeGen.cleanup_service(service_name) returncode, _, stderr = backend.run_on_head(handle, code, require_outputs=True, @@ -1202,8 +1192,7 @@ def serve_down(service_name: str, purge: bool = False) -> None: separate_stderr=True) subprocess_utils.handle_returncode( returncode, - code, ('Failed when cleaning up service files on controller ' - f'of service {service_name!r}'), + code, ('Failed when cleaning up service {service_name!r}'), stderr, stream_logs=False) diff --git a/sky/execution.py b/sky/execution.py index ea9721e5048..6b86284347c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -25,14 +25,12 @@ from sky import sky_logging from sky import skypilot_config from sky import spot -from sky import status_lib from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp from sky.data import data_utils from sky.data import storage as storage_lib from sky.skylet import constants -from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import dag_utils @@ -174,7 +172,7 @@ def _execute( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, -) -> Optional[int]: +) -> None: """Execute an entrypoint. If sky.Task is given or DAG has not been optimized yet, this will call @@ -212,10 +210,6 @@ def _execute( idle_minutes_to_autostop: int; if provided, the cluster will be set to autostop after this many minutes of idleness. no_setup: bool; whether to skip setup commands or not when (re-)launching. - - Returns: - A job id (int) if the job is submitted successfully and backend is - CloudVmRayBackend, otherwise None. """ dag = _convert_to_dag(entrypoint) assert len(dag) == 1, f'We support 1 task for now. 
{dag}' @@ -321,7 +315,6 @@ def _execute( # Optimizer should eventually choose where to store bucket task.sync_storage_mounts() - job_id = None try: if Stage.PROVISION in stages: if handle is None: @@ -334,7 +327,7 @@ def _execute( if dryrun and handle is None: logger.info('Dryrun finished.') - return None + return if Stage.SYNC_WORKDIR in stages and not dryrun: if task.workdir is not None: @@ -359,10 +352,7 @@ def _execute( if Stage.EXEC in stages: try: global_user_state.update_last_use(handle.get_cluster_name()) - job_id = backend.execute(handle, - task, - detach_run, - dryrun=dryrun) + backend.execute(handle, task, detach_run, dryrun=dryrun) finally: # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, down) @@ -387,7 +377,6 @@ def _execute( subprocess_utils.run('sky status --no-show-spot-jobs', env=env) print() print('\x1b[?25h', end='') # Show cursor. - return job_id @timeline.event @@ -1047,8 +1036,7 @@ def serve_up( service_name, launched_at=int(time.time()), controller_name=controller_name, - handle=service_handle, - status=status_lib.ServiceStatus.CONTROLLER_INIT) + handle=service_handle) controller_resources = controller_resources.copy( ports=[load_balancer_port]) except filelock.Timeout as e: @@ -1103,10 +1091,10 @@ def serve_up( controller_task = task_lib.Task.from_yaml(controller_yaml_path) controller_task.set_resources(controller_resources) - # Set this flag to modify default ray task CPU usage to custom value + # Set this to modify default ray task CPU usage to custom value # instead of default 0.5 vCPU. We need to set it to a smaller value # to support a larger number of services. - controller_task.is_sky_serve_controller_task = True + controller_task.service_handle = service_handle controller_task.update_envs(_shared_controller_env_vars()) @@ -1114,7 +1102,7 @@ def serve_up( style = colorama.Style print(f'\n{fore.YELLOW}Launching controller for {service_name!r}...' 
f'{style.RESET_ALL}') - job_id = _execute( + _execute( entrypoint=controller_task, stream_logs=False, cluster_name=controller_name, @@ -1129,53 +1117,15 @@ def serve_up( controller_record = global_user_state.get_cluster_from_name( controller_name) - if controller_record is None: - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Controller failed to launch. Please check the logs above.') + assert controller_record is not None handle = controller_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend), backend - backend.register_info(minimize_logging=True) service_handle.endpoint_ip = handle.head_ip global_user_state.set_service_handle(service_name, service_handle) - def _wait_until_job_is_running_on_controller( - job_id: Optional[int]) -> bool: - if job_id is None: - return False - for _ in range(serve.SERVE_STARTUP_TIMEOUT): - job_statuses = backend.get_job_status(handle, [job_id], - stream_logs=False) - job_status = job_statuses.get(job_id, None) - if job_status == job_lib.JobStatus.RUNNING: - return True - time.sleep(1) - # Cancel any jobs that are still pending after timeout. - if job_status == job_lib.JobStatus.PENDING: - backend.cancel_jobs(handle, jobs=[job_id]) - return False - - if not _wait_until_job_is_running_on_controller(job_id): - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.CONTROLLER_FAILED) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Controller failed to launch. Please check ' - f'the logs with sky serve logs {service_name} ' - '--controller') - - service_handle.job_id = job_id - global_user_state.set_service_handle(service_name, service_handle) print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' 
f'{style.RESET_ALL}') - global_user_state.set_service_status( - service_name, status_lib.ServiceStatus.REPLICA_INIT) - print(f'\n{fore.CYAN}Service name: ' f'{style.BRIGHT}{service_name}{style.RESET_ALL}' '\nTo see detailed info:' diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 4fd0cee0031..09dc6cc07b8 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -99,8 +99,7 @@ def create_table(cursor, conn): name TEXT PRIMARY KEY, launched_at INTEGER, controller_name TEXT, - handle BLOB, - status TEXT)""") + handle BLOB)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. @@ -282,11 +281,10 @@ def add_or_update_cluster(cluster_name: str, def add_or_update_service(name: str, launched_at: int, controller_name: str, - handle: 'serve.ServiceHandle', - status: status_lib.ServiceStatus) -> None: + handle: 'serve.ServiceHandle') -> None: _DB.cursor.execute( 'INSERT or REPLACE INTO services' - '(name, launched_at, controller_name, handle, status) ' + '(name, launched_at, controller_name, handle) ' 'VALUES (' # name '?, ' @@ -295,8 +293,6 @@ def add_or_update_service(name: str, launched_at: int, controller_name: str, # controller_name '?, ' # handle - '?, ' - # status '?' ')', ( @@ -308,8 +304,6 @@ def add_or_update_service(name: str, launched_at: int, controller_name: str, controller_name, # handle pickle.dumps(handle), - # status - status.value, )) _DB.conn.commit() @@ -361,16 +355,6 @@ def remove_service(service_name: str): _DB.conn.commit() -def set_service_status(service_name: str, status: status_lib.ServiceStatus): - _DB.cursor.execute('UPDATE services SET status=(?) 
' - 'WHERE name=(?)', (status.value, service_name)) - count = _DB.cursor.rowcount - _DB.conn.commit() - assert count <= 1, count - if count == 0: - raise ValueError(f'Service {service_name} not found.') - - def set_service_handle(service_name: str, handle: 'serve.ServiceHandle'): _DB.cursor.execute('UPDATE services SET handle=(?) ' 'WHERE name=(?)', (pickle.dumps(handle), service_name)) @@ -613,14 +597,13 @@ def _get_service_from_row(row) -> Dict[str, Any]: # Explicitly specify the number of fields to unpack, so that # we can add new fields to the database in the future without # breaking the previous code. - name, launched_at, controller_name, handle, status = row[:5] + name, launched_at, controller_name, handle = row[:4] # TODO: use namedtuple instead of dict return { 'name': name, 'launched_at': launched_at, 'controller_name': controller_name, 'handle': pickle.loads(handle), - 'status': status_lib.ServiceStatus[status], } diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index e05b1711593..4a2b475d292 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -9,7 +9,6 @@ from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import SERVE_PREFIX -from sky.serve.constants import SERVE_STARTUP_TIMEOUT from sky.serve.constants import SERVICE_NAME_VALID_REGEX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.serve_utils import gen_ports_for_serve_process @@ -22,7 +21,6 @@ from sky.serve.serve_utils import get_available_controller_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result -from sky.serve.serve_utils import replica_info_to_service_status from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 993161fc716..ed6b8d6778f 100644 
--- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -37,9 +37,6 @@ # TODO(tian): Expose this option to users in yaml file. READINESS_PROBE_TIMEOUT = 15 -# The time to wait for a service to start up when we start a service. -SERVE_STARTUP_TIMEOUT = 60 - # The default controller resources. # We need 200 GB disk space to enable using Azure as controller, since its image # size is 150 GB. Also, we need 32 GB memory to run our controller and load diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 5bcc149e9ff..aedf09e3806 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -19,8 +19,10 @@ from sky import authentication from sky import serve from sky import sky_logging +from sky import status_lib from sky.serve import autoscalers from sky.serve import infra_providers +from sky.serve import serve_state from sky.utils import env_options # Use the explicit logger name so that the logger is under the @@ -90,14 +92,24 @@ def get_ready_replicas(): @self.app.get('/controller/is_terminating') def is_terminating(): + if self.terminating: + self.load_balancer_received_terminal_signal = True return {'is_terminating': self.terminating} @self.app.get('/controller/get_latest_info') def get_latest_info(): + # NOTE(dev): Keep this align with + # sky.backends.backend_utils._add_default_value_to_local_record + record = serve_state.get_service_from_name( + self.infra_provider.service_name) + if record is None: + record = {} latest_info = { 'replica_info': self.infra_provider.get_replica_info(verbose=True), - 'uptime': self.infra_provider.get_uptime(), + 'uptime': record.get('uptime', None), + 'status': record.get('status', + status_lib.ServiceStatus.UNKNOWN), } latest_info = { k: base64.b64encode(pickle.dumps(v)).decode('utf-8') @@ -109,6 +121,8 @@ def get_latest_info(): def terminate(request: fastapi.Request): del request logger.info('Terminating service...') + serve_state.set_status(self.infra_provider.service_name, + 
status_lib.ServiceStatus.SHUTTING_DOWN) if self.autoscaler is not None: logger.info('Terminate autoscaler...') self.autoscaler.terminate() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 102ef8e3713..78bf9349b4d 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -16,6 +16,7 @@ from sky import global_user_state from sky import status_lib from sky.serve import constants +from sky.serve import serve_state from sky.serve import serve_utils from sky.skylet import job_lib from sky.utils import env_options @@ -194,17 +195,19 @@ class InfraProvider: def __init__( self, + service_name: str, controller_port: int, readiness_suffix: str, initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: - self.replica_info: serve_utils.ThreadSafeDict[ - str, ReplicaInfo] = serve_utils.ThreadSafeDict() + self.service_name: str = service_name self.controller_port = controller_port self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data self.uptime: Optional[float] = None + self.replica_info: serve_utils.ThreadSafeDict[ + str, ReplicaInfo] = serve_utils.ThreadSafeDict() logger.info(f'Readiness probe suffix: {self.readiness_suffix}') logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') logger.info(f'Post data: {self.post_data} ({type(self.post_data)})') @@ -213,9 +216,6 @@ def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: # Get replica info for all replicas raise NotImplementedError - def get_uptime(self) -> Optional[float]: - return self.uptime - def total_replica_num(self, count_failed_replica: bool) -> int: # Returns the total number of replicas raise NotImplementedError @@ -244,11 +244,9 @@ def start_replica_prober(self) -> None: class SkyPilotInfraProvider(InfraProvider): """Infra provider for SkyPilot clusters.""" - def __init__(self, task_yaml_path: 
str, service_name: str, *args, - **kwargs) -> None: + def __init__(self, task_yaml_path: str, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.task_yaml_path: str = task_yaml_path - self.service_name: str = service_name self.next_replica_id: int = 1 self.launch_process_pool: serve_utils.ThreadSafeDict[ str, subprocess.Popen] = serve_utils.ThreadSafeDict() @@ -602,6 +600,8 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: logger.info(f'Replica {replica_ip} is the first ' 'ready replica. Setting uptime to ' f'{self.uptime}.') + serve_state.set_uptime(self.service_name, + int(self.uptime)) return info.cluster_name, True except requests.exceptions.RequestException as e: logger.info(e) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py new file mode 100644 index 00000000000..ae139420c13 --- /dev/null +++ b/sky/serve/serve_state.py @@ -0,0 +1,95 @@ +"""The database for services information.""" +import pathlib +import sqlite3 +from typing import Any, Dict, List, Optional + +from sky import status_lib +from sky.serve import constants +from sky.utils import db_utils + +_DB_PATH = pathlib.Path(constants.SERVE_PREFIX) / 'services.db' +_DB_PATH = _DB_PATH.expanduser().absolute() +_DB_PATH.parents[0].mkdir(parents=True, exist_ok=True) +_DB_PATH = str(_DB_PATH) + +# Module-level connection/cursor; thread-safe as the module is only imported +# once. 
+_CONN = sqlite3.connect(_DB_PATH) +_CURSOR = _CONN.cursor() + +_CURSOR.execute("""\ + CREATE TABLE IF NOT EXISTS services ( + name TEXT PRIMARY KEY, + controller_job_id INTEGER, + controller_port INTEGER, + status TEXT, + uptime INTEGER DEFAULT NULL)""") +_CONN.commit() + + +def add_service(job_id: int, service_name: str, controller_port: int) -> None: + """Adds a service to the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + INSERT INTO services + (name, controller_job_id, controller_port, status) + VALUES (?, ?, ?, ?)""", + (service_name, job_id, controller_port, + status_lib.ServiceStatus.CONTROLLER_INIT.value)) + + +def remove_service(service_name: str) -> None: + """Removes a service from the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute("""\ + DELETE FROM services WHERE name=(?)""", (service_name,)) + + +def set_uptime(service_name: str, uptime: int) -> None: + """Sets the uptime of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + UPDATE services SET + uptime=(?) WHERE name=(?)""", (uptime, service_name)) + + +def set_status(service_name: str, status: status_lib.ServiceStatus) -> None: + """Sets the service status.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + UPDATE services SET + status=(?) 
WHERE name=(?)""", (status.value, service_name)) + + +def _get_service_from_row(row) -> Dict[str, Any]: + name, controller_job_id, controller_port, status, uptime = row[:5] + return { + 'name': name, + 'controller_job_id': controller_job_id, + 'controller_port': controller_port, + 'status': status_lib.ServiceStatus[status], + 'uptime': uptime, + } + + +def get_services() -> List[Dict[str, Any]]: + """Get all existing service records.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + rows = cursor.execute('SELECT * FROM services').fetchall() + records = [] + for row in rows: + records.append(_get_service_from_row(row)) + return records + + +def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]: + """Get all existing service records.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + rows = cursor.execute('SELECT * FROM services WHERE name=(?)', + (service_name,)).fetchall() + for row in rows: + return _get_service_from_row(row) + return None diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 6a4b3de62ef..2e189776e70 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -21,6 +21,8 @@ from sky import status_lib from sky.data import storage as storage_lib from sky.serve import constants +from sky.serve import serve_state +from sky.skylet import job_lib from sky.utils import common_utils if typing.TYPE_CHECKING: @@ -312,7 +314,6 @@ class ServiceHandle(object): - (required) Whether the service have auto restart enabled. - (required) Controller port. - (required) LoadBalancer port. - - (optional) Service uptime. - (optional) Service endpoint IP. - (optional) Controller and LoadBalancer job id. - (optional) Ephemeral storage generated for the service. 
@@ -331,9 +332,7 @@ def __init__( auto_restart: bool, controller_port: int, load_balancer_port: int, - uptime: Optional[int] = None, endpoint_ip: Optional[str] = None, - job_id: Optional[int] = None, ephemeral_storage: Optional[List[Dict[str, Any]]] = None, ) -> None: self._version = self._VERSION @@ -344,9 +343,7 @@ def __init__( self.auto_restart = auto_restart self.controller_port = controller_port self.load_balancer_port = load_balancer_port - self.uptime = uptime self.endpoint_ip = endpoint_ip - self.job_id = job_id self.ephemeral_storage = ephemeral_storage def __repr__(self) -> str: @@ -359,9 +356,7 @@ def __repr__(self) -> str: f'\n\tauto_restart={self.auto_restart},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' - f'\n\tuptime={self.uptime},' f'\n\tendpoint_ip={self.endpoint_ip},' - f'\n\tjob_id={self.job_id},' f'\n\tephemeral_storage={self.ephemeral_storage})') def cleanup_ephemeral_storage(self) -> None: @@ -407,6 +402,19 @@ def load_terminate_service_result(payload: str) -> Any: return terminate_resp +def check_service_status_healthy(service_name: str) -> Optional[str]: + service_record = serve_state.get_service_from_name(service_name) + if service_record is None: + return f'Service {service_name!r} does not exist.' + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: + return (f'Service {service_name!r} is still initializing its ' + 'controller. Please try again later.') + if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: + return (f'Service {service_name!r}\'s controller failed. 
' + 'Cannot tail logs.') + return None + + def _follow_replica_logs( file: TextIO, cluster_name: str, @@ -475,6 +483,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool, skip_local_log_file_check: bool = False) -> str: + msg = check_service_status_healthy(service_name) + if msg is not None: + return msg print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process ' f'of replica {replica_id}.{colorama.Style.RESET_ALL}') local_log_file_name = generate_replica_local_log_file_name( @@ -565,7 +576,10 @@ def _follow_logs(file: TextIO, exit_if_stream_end: bool) -> Iterator[str]: def stream_serve_process_logs(service_name: str, stream_controller: bool, - follow: bool) -> None: + follow: bool) -> str: + msg = check_service_status_healthy(service_name) + if msg is not None: + return msg if stream_controller: log_file = generate_remote_controller_log_file_name(service_name) else: @@ -573,9 +587,10 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool, with open(os.path.expanduser(log_file), 'r', newline='') as f: for line in _follow_logs(f, exit_if_stream_end=not follow): print(line, end='', flush=True) + return '' -def cleanup_service_files(service_name: str) -> None: +def cleanup_service_utility_files(service_name: str) -> None: """Cleanup utility files for a service.""" dir_name = generate_remote_service_dir_name(service_name) dir_name = os.path.expanduser(dir_name) @@ -583,6 +598,32 @@ def cleanup_service_files(service_name: str) -> None: shutil.rmtree(dir_name) +def refresh_service_status() -> None: + services = serve_state.get_services() + for record in services: + controller_status = job_lib.get_status(record['controller_job_id']) + if controller_status is None or controller_status.is_terminal(): + # If controller job is not running, set it as controller failed. 
+ serve_state.set_status(record['name'], + status_lib.ServiceStatus.CONTROLLER_FAILED) + elif record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: + if controller_status == job_lib.JobStatus.RUNNING: + # If controller job is running, update the status to + # REPLICA_INIT. + serve_state.set_status(record['name'], + status_lib.ServiceStatus.REPLICA_INIT) + # When the service is shutting down, there is a period of time which the + # controller still responds to the request, and the replica is not + # terminated, so the return value for _service_status_from_replica_info + # will still be READY, but we don't want change service status to READY. + elif record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: + latest_info = load_latest_info( + get_latest_info(record['controller_port'])) + serve_state.set_status( + record['name'], + replica_info_to_service_status(latest_info['replica_info'])) + + class ServeCodeGen: """Code generator for SkyServe. @@ -590,9 +631,19 @@ class ServeCodeGen: >> code = ServeCodeGen.get_latest_info(controller_port) """ _PREFIX = [ + 'from sky.serve import serve_state', 'from sky.serve import serve_utils', ] + @classmethod + def add_service(cls, job_id: int, service_handle: ServiceHandle) -> str: + code = [ + f'serve_state.add_service({job_id}, ' + f'{service_handle.service_name!r}, ' + f'{service_handle.controller_port})', + ] + return cls._build(code) + @classmethod def get_latest_info(cls, controller_port: int) -> str: code = [ @@ -628,15 +679,16 @@ def stream_replica_logs(cls, def stream_serve_process_logs(cls, service_name: str, stream_controller: bool, follow: bool) -> str: code = [ - f'serve_utils.stream_serve_process_logs({service_name!r}, ' - f'{stream_controller}, follow={follow})', + f'msg = serve_utils.stream_serve_process_logs({service_name!r}, ' + f'{stream_controller}, follow={follow})', 'print(msg, flush=True)' ] return cls._build(code) @classmethod - def cleanup_service_files(cls, service_name: str) -> str: + def 
cleanup_service(cls, service_name: str) -> str: code = [ - f'serve_utils.cleanup_service_files({service_name!r})', + f'serve_utils.cleanup_service_utility_files({service_name!r})', + f'serve_state.remove_service({service_name!r})', ] return cls._build(code) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index f35c6ef3ba1..077333dcf8b 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -39,7 +39,7 @@ # lifetime of the job. TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS' -SKYLET_VERSION = '2' +SKYLET_VERSION = '3' SKYLET_VERSION_FILE = '~/.sky/skylet_version' # `sky spot dashboard`-related diff --git a/sky/skylet/events.py b/sky/skylet/events.py index a9e4db42b73..87eef6ac00a 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -13,6 +13,7 @@ from sky import sky_logging from sky.backends import backend_utils from sky.backends import cloud_vm_ray_backend +from sky.serve import serve_utils from sky.skylet import autostop_lib from sky.skylet import job_lib from sky.spot import spot_utils @@ -70,6 +71,14 @@ def _run(self): spot_utils.update_spot_job_status() +class RefreshServiceStatusEvent(SkyletEvent): + """Skylet event for monitoring sky serve controller jobs.""" + EVENT_INTERVAL_SECONDS = 5 + + def _run(self): + serve_utils.refresh_service_status() + + class AutostopEvent(SkyletEvent): """Skylet event for autostop. diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index 6bbb51e7a37..547d75864d8 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -18,6 +18,7 @@ # Otherwise, the abnormal spot job status update will be delayed # until the next job update event. 
events.SpotJobUpdateEvent(), + events.RefreshServiceStatusEvent(), ] while True: diff --git a/sky/status_lib.py b/sky/status_lib.py index 2d093818807..82ceaf391b7 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -70,6 +70,9 @@ class ServiceStatus(enum.Enum): # Service is being shutting down SHUTTING_DOWN = 'SHUTTING_DOWN' + # Cannot connect to controller + UNKNOWN = 'UNKNOWN' + # At least one replica is failed and no replica is ready FAILED = 'FAILED' @@ -84,6 +87,7 @@ def colored_str(self): ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, ServiceStatus.READY: colorama.Fore.GREEN, ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, + ServiceStatus.UNKNOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, } diff --git a/sky/task.py b/sky/task.py index 892ea0c80a1..c67ee5adab1 100644 --- a/sky/task.py +++ b/sky/task.py @@ -266,8 +266,8 @@ def __init__( # the underlying managed spot dag (sky.Dag object). self.spot_dag: Optional['sky.Dag'] = None - # Only set to True when 'self' is a sky serve controller task. - self.is_sky_serve_controller_task = False + # Only set when 'self' is a sky serve controller task. + self.service_handle: Optional['serve_lib.ServiceHandle'] = None # Filled in by the optimizer. If None, this Task is not planned. 
self.best_resources = None diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 14f24a14c34..5796894f94b 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -413,8 +413,7 @@ def _get_requested_resources(service_record: _ServiceRecord) -> 'sky.Resources': def _get_uptime(service_record: _ServiceRecord) -> str: - handle = _get_service_handle(service_record) - uptime = handle.uptime + uptime = service_record['uptime'] if uptime is None: return '-' return log_utils.readable_time_duration(uptime, absolute=True) From 8799e099465c682f47aa7715680e7a8ce79f6122 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 5 Oct 2023 13:52:19 -0700 Subject: [PATCH 097/223] minor --- sky/backends/local_docker_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index cfdd432eda3..2b466dff4cf 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -270,6 +270,7 @@ def _execute(self, detach_run: bool, dryrun: bool = False) -> None: """ Launches the container.""" + if detach_run: raise NotImplementedError('detach_run=True is not supported in ' 'LocalDockerBackend.') From 849ba8117e08b7640f4dd36d428a4aa67d2e11dc Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 5 Oct 2023 13:55:15 -0700 Subject: [PATCH 098/223] merge controllers with normal clusters --- sky/cli.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 58915d18ae8..013cff92c7e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1845,15 +1845,9 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - nonreserved_cluster_records, all) + nonreserved_cluster_records + reserved_clusters, all) status_utils.show_local_status_table(local_clusters) - if reserved_clusters: - 
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}\n' - f'Controllers{colorama.Style.RESET_ALL}') - num_pending_autostop += status_utils.show_status_table( - reserved_clusters, all) - if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Managed spot jobs{colorama.Style.RESET_ALL}') From 4fa8aa0e22d76c051793f96c3e7ee2c84cefe2f2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 5 Oct 2023 22:52:55 -0700 Subject: [PATCH 099/223] deprecate controller port adn refresh in infra provider --- sky/backends/backend_utils.py | 3 +- sky/backends/cloud_vm_ray_backend.py | 3 +- sky/core.py | 3 +- sky/serve/constants.py | 3 + sky/serve/controller.py | 1 - sky/serve/infra_providers.py | 5 +- sky/serve/serve_utils.py | 148 ++++++++++++++++++--------- sky/skylet/events.py | 6 +- sky/skylet/skylet.py | 2 +- 9 files changed, 113 insertions(+), 61 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6a3d882c379..ae0951109ca 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2761,8 +2761,7 @@ def _refresh_service_record_no_lock( backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = serve_lib.ServeCodeGen.get_latest_info( - service_handle.controller_port) + code = serve_lib.ServeCodeGen.get_latest_info(service_name) returncode, latest_info_payload, stderr = backend.run_on_head( handle, code, diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a842ece13fe..ee878a24b1c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3803,8 +3803,7 @@ def tail_serve_logs(self, handle: CloudVmRayResourceHandle, else: assert replica_id is not None, service_handle code = serve_lib.ServeCodeGen.stream_replica_logs( - service_handle.service_name, service_handle.controller_port, - replica_id, follow) + service_handle.service_name, replica_id, follow) 
signal.signal(signal.SIGINT, backend_utils.interrupt_handler) signal.signal(signal.SIGTSTP, backend_utils.stop_handler) diff --git a/sky/core.py b/sky/core.py index 578811acdfe..f28863abf9f 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1120,8 +1120,7 @@ def serve_down(service_name: str, purge: bool = False) -> None: backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) try: - code = serve.ServeCodeGen.terminate_service( - service_handle.controller_port) + code = serve.ServeCodeGen.terminate_service(service_name) try: (returncode, terminate_service_payload, diff --git a/sky/serve/constants.py b/sky/serve/constants.py index ed6b8d6778f..260aabaa86b 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -37,6 +37,9 @@ # TODO(tian): Expose this option to users in yaml file. READINESS_PROBE_TIMEOUT = 15 +# Wait for 1 minutes for controller / load balancer to terminate. +SERVE_TERMINATE_WAIT_TIMEOUT = 60 + # The default controller resources. # We need 200 GB disk space to enable using Azure as controller, since its image # size is 150 GB. 
Also, we need 32 GB memory to run our controller and load diff --git a/sky/serve/controller.py b/sky/serve/controller.py index aedf09e3806..705b8a5a997 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -183,7 +183,6 @@ def terminate(request: fastapi.Request): _infra_provider = infra_providers.SkyPilotInfraProvider( args.task_yaml, args.service_name, - controller_port=args.controller_port, readiness_suffix=service_spec.readiness_suffix, initial_delay_seconds=service_spec.initial_delay_seconds, post_data=service_spec.post_data) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 78bf9349b4d..aca16b54c45 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -196,12 +196,10 @@ class InfraProvider: def __init__( self, service_name: str, - controller_port: int, readiness_suffix: str, initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: self.service_name: str = service_name - self.controller_port = controller_port self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data @@ -449,7 +447,6 @@ def _teardown_cluster(self, logger.info(f'Syncing down logs for cluster {cluster_name}...') code = serve_utils.ServeCodeGen.stream_replica_logs( self.service_name, - self.controller_port, replica_id, follow=False, skip_local_log_file_check=True) @@ -553,6 +550,8 @@ def _replica_prober(self) -> None: logger.info('Running replica prober.') try: self._probe_all_replicas() + serve_utils.set_service_status_from_replica_info( + self.service_name, self.get_replica_info(verbose=True)) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # replica prober running. 
diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 2e189776e70..6fc0a3d4371 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -1,6 +1,7 @@ """User interface with the SkyServe.""" import base64 import collections +import copy import os import pickle import re @@ -35,6 +36,13 @@ f'{colorama.Fore.RED}Failed to find replica ' '{replica_id}. Please use `sky serve status [SERVICE_ID]`' f' to check all valid replica id.{colorama.Style.RESET_ALL}') +# The log information when a FastAPI APP terminates. +_FASTAPI_APP_TERMINATE_MSGS = [ + 'Shutting down', + 'Waiting for application shutdown.', + 'Application shutdown complete.', + 'Finished server process', +] KeyType = TypeVar('KeyType') ValueType = TypeVar('ValueType') @@ -292,16 +300,38 @@ def get_available_controller_name( key=lambda k: available_controller_to_service_num[k]), False -def replica_info_to_service_status( - replica_info: List[Dict[str, Any]]) -> status_lib.ServiceStatus: +def set_service_status_from_replica_info( + service_name: str, replica_info: List[Dict[str, Any]]) -> None: + record = serve_state.get_service_from_name(service_name) + if record is None: + raise ValueError(f'Service {service_name!r} does not exist. ' + 'Cannot refresh service status.') + if record['status'] == status_lib.ServiceStatus.SHUTTING_DOWN: + # When the service is shutting down, there is a period of time which the + # controller still responds to the request, and the replica is not + # terminated, the service status will still be READY, but we don't want + # change service status to READY. + return status2num = collections.Counter([i['status'] for i in replica_info]) # If one replica is READY, the service is READY. 
if status2num[status_lib.ReplicaStatus.READY] > 0: - return status_lib.ServiceStatus.READY - if sum(status2num[status] - for status in status_lib.ReplicaStatus.failed_statuses()) > 0: - return status_lib.ServiceStatus.FAILED - return status_lib.ServiceStatus.REPLICA_INIT + status = status_lib.ServiceStatus.READY + elif sum(status2num[status] + for status in status_lib.ReplicaStatus.failed_statuses()) > 0: + status = status_lib.ServiceStatus.FAILED + else: + status = status_lib.ServiceStatus.REPLICA_INIT + serve_state.set_status(service_name, status) + + +def monitor_service_controller_job_status() -> None: + services = serve_state.get_services() + for record in services: + controller_status = job_lib.get_status(record['controller_job_id']) + if controller_status is None or controller_status.is_terminal(): + # If controller job is not running, set it as controller failed. + serve_state.set_status(record['name'], + status_lib.ServiceStatus.CONTROLLER_FAILED) class ServiceHandle(object): @@ -371,13 +401,29 @@ def __setstate__(self, state): self.__dict__.update(state) -def get_latest_info(controller_port: int) -> str: +def _get_controller_port_from_service_name(service_name: str) -> int: + record = serve_state.get_service_from_name(service_name) + if record is None: + raise ValueError(f'Service {service_name!r} does not exist.') + return record['controller_port'] + + +def _get_latest_info(service_name: str, decode: bool = True) -> Dict[str, Any]: + controller_port = _get_controller_port_from_service_name(service_name) resp = requests.get( _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + '/controller/get_latest_info') - if resp.status_code != 200: - raise ValueError(f'Failed to get replica info: {resp.text}') - return common_utils.encode_payload(resp.json()) + resp.raise_for_status() + if not decode: + return resp.json() + return { + k: pickle.loads(base64.b64decode(v)) for k, v in resp.json().items() + } + + +def get_latest_info(service_name: str) -> str: + 
return common_utils.encode_payload( + _get_latest_info(service_name, decode=False)) def load_latest_info(payload: str) -> Dict[str, Any]: @@ -388,7 +434,8 @@ def load_latest_info(payload: str) -> Dict[str, Any]: return latest_info -def terminate_service(controller_port: int) -> str: +def terminate_service(service_name: str) -> str: + controller_port = _get_controller_port_from_service_name(service_name) resp = requests.post( _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + '/controller/terminate') @@ -479,13 +526,13 @@ def cluster_is_up() -> bool: def stream_replica_logs(service_name: str, - controller_port: int, replica_id: int, follow: bool, skip_local_log_file_check: bool = False) -> str: msg = check_service_status_healthy(service_name) if msg is not None: return msg + controller_port = _get_controller_port_from_service_name(service_name) print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process ' f'of replica {replica_id}.{colorama.Style.RESET_ALL}') local_log_file_name = generate_replica_local_log_file_name( @@ -590,6 +637,38 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool, return '' +def wait_until_controller_and_load_balancer_terminate( + service_name: str) -> None: + + def wait_until_terminate_info_appear_in_log_file(log_file: str) -> bool: + all_terminate_information = copy.copy(_FASTAPI_APP_TERMINATE_MSGS) + start_time = time.time() + with open(os.path.expanduser(log_file), 'r', newline='') as f: + for line in _follow_logs(f, exit_if_stream_end=False): + for info in all_terminate_information: + if info in line: + all_terminate_information.remove(info) + if not all_terminate_information: + return True + if (time.time() - start_time > + constants.SERVE_TERMINATE_WAIT_TIMEOUT): + break + return False + + # Wait the load balancer to terminate first since it is the first one + # to terminate and the controller will wait for it to terminate. 
+ load_balancer_log = generate_remote_load_balancer_log_file_name( + service_name) + if not wait_until_terminate_info_appear_in_log_file(load_balancer_log): + raise ValueError( + f'{colorama.Fore.RED}Failed to wait for load balancer to ' + f'terminate.{colorama.Style.RESET_ALL}') + controller_log = generate_remote_controller_log_file_name(service_name) + if not wait_until_terminate_info_appear_in_log_file(controller_log): + raise ValueError(f'{colorama.Fore.RED}Failed to wait for controller to ' + f'terminate.{colorama.Style.RESET_ALL}') + + def cleanup_service_utility_files(service_name: str) -> None: """Cleanup utility files for a service.""" dir_name = generate_remote_service_dir_name(service_name) @@ -598,37 +677,11 @@ def cleanup_service_utility_files(service_name: str) -> None: shutil.rmtree(dir_name) -def refresh_service_status() -> None: - services = serve_state.get_services() - for record in services: - controller_status = job_lib.get_status(record['controller_job_id']) - if controller_status is None or controller_status.is_terminal(): - # If controller job is not running, set it as controller failed. - serve_state.set_status(record['name'], - status_lib.ServiceStatus.CONTROLLER_FAILED) - elif record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: - if controller_status == job_lib.JobStatus.RUNNING: - # If controller job is running, update the status to - # REPLICA_INIT. - serve_state.set_status(record['name'], - status_lib.ServiceStatus.REPLICA_INIT) - # When the service is shutting down, there is a period of time which the - # controller still responds to the request, and the replica is not - # terminated, so the return value for _service_status_from_replica_info - # will still be READY, but we don't want change service status to READY. 
- elif record['status'] != status_lib.ServiceStatus.SHUTTING_DOWN: - latest_info = load_latest_info( - get_latest_info(record['controller_port'])) - serve_state.set_status( - record['name'], - replica_info_to_service_status(latest_info['replica_info'])) - - class ServeCodeGen: """Code generator for SkyServe. Usage: - >> code = ServeCodeGen.get_latest_info(controller_port) + >> code = ServeCodeGen.get_latest_info(service_name) """ _PREFIX = [ 'from sky.serve import serve_state', @@ -645,17 +698,17 @@ def add_service(cls, job_id: int, service_handle: ServiceHandle) -> str: return cls._build(code) @classmethod - def get_latest_info(cls, controller_port: int) -> str: + def get_latest_info(cls, service_name: str) -> str: code = [ - f'msg = serve_utils.get_latest_info({controller_port})', + f'msg = serve_utils.get_latest_info({service_name!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def terminate_service(cls, controller_port: int) -> str: + def terminate_service(cls, service_name: str) -> str: code = [ - f'msg = serve_utils.terminate_service({controller_port})', + f'msg = serve_utils.terminate_service({service_name!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @@ -663,13 +716,12 @@ def terminate_service(cls, controller_port: int) -> str: @classmethod def stream_replica_logs(cls, service_name: str, - controller_port: int, replica_id: int, follow: bool, skip_local_log_file_check: bool = False) -> str: code = [ - f'msg = serve_utils.stream_replica_logs({service_name!r}, ' - f'{controller_port}, {replica_id!r}, follow={follow}, ' + 'msg = serve_utils.stream_replica_logs(' + f'{service_name!r}, {replica_id!r}, follow={follow}, ' f'skip_local_log_file_check={skip_local_log_file_check})', 'print(msg, flush=True)' ] @@ -687,6 +739,8 @@ def stream_serve_process_logs(cls, service_name: str, @classmethod def cleanup_service(cls, service_name: str) -> str: code = [ + 
'serve_utils.wait_until_controller_and_load_balancer_terminate(' + f'{service_name!r})', f'serve_utils.cleanup_service_utility_files({service_name!r})', f'serve_state.remove_service({service_name!r})', ] diff --git a/sky/skylet/events.py b/sky/skylet/events.py index 87eef6ac00a..68c5c3af4ab 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -71,12 +71,12 @@ def _run(self): spot_utils.update_spot_job_status() -class RefreshServiceStatusEvent(SkyletEvent): +class ServiceJobStatusMonitorEvent(SkyletEvent): """Skylet event for monitoring sky serve controller jobs.""" - EVENT_INTERVAL_SECONDS = 5 + EVENT_INTERVAL_SECONDS = 300 def _run(self): - serve_utils.refresh_service_status() + serve_utils.monitor_service_controller_job_status() class AutostopEvent(SkyletEvent): diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index 547d75864d8..3fc93913fea 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -18,7 +18,7 @@ # Otherwise, the abnormal spot job status update will be delayed # until the next job update event. events.SpotJobUpdateEvent(), - events.RefreshServiceStatusEvent(), + events.ServiceJobStatusMonitorEvent(), ] while True: From e0392c7e50457ff48001064454114d1c93c961d5 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 10 Oct 2023 10:32:10 -0700 Subject: [PATCH 100/223] Apply suggestions from code review Co-authored-by: Zhanghao Wu Co-authored-by: Zongheng Yang --- sky/backends/backend_utils.py | 4 ++-- sky/core.py | 4 ++-- sky/templates/sky-serve-controller.yaml.j2 | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ae0951109ca..9fd4cb715a0 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2792,8 +2792,8 @@ def _refresh_service_record( global_user_state.get_service_from_name(service_name)), msg -# TODO(tian): Maybe aggregate services using same controller to reduce SSH -# overhead? 
+# TODO(tian): We can optimize the number of ssh connections, by querying the +# statuses of multiple services in a single ssh def refresh_service_status( service_names: Optional[Union[str, List[str]]]) -> List[Dict[str, Any]]: yellow = colorama.Fore.YELLOW diff --git a/sky/core.py b/sky/core.py index f28863abf9f..e9dd53894d1 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1134,7 +1134,7 @@ def serve_down(service_name: str, purge: bool = False) -> None: subprocess_utils.handle_returncode( returncode, - code, ('Failed when submit terminate request to controller ' + code, ('Failed when submit termination request to controller ' f'of service {service_name!r}'), stderr, stream_logs=False) @@ -1191,7 +1191,7 @@ def serve_down(service_name: str, purge: bool = False) -> None: separate_stderr=True) subprocess_utils.handle_returncode( returncode, - code, ('Failed when cleaning up service {service_name!r}'), + code, (f'Failed cleaning up service {service_name!r}'), stderr, stream_logs=False) diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index f041ab5ac52..00d96111891 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -16,7 +16,7 @@ file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} run: | - # Create working directory for this services. + # Create working directory for this service. mkdir -p {{service_dir}} # Start sky serve controller. 
python -u -m sky.serve.controller --service-name {{service_name}} \ From b9b0a78bd394cca843e7cc4306b3ebab61362d2f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Oct 2023 11:47:58 -0700 Subject: [PATCH 101/223] use enum in serve logs + minor --- examples/serve/ray_serve/serve.py | 11 +++--- sky/__init__.py | 4 +++ sky/backends/backend_utils.py | 6 ++-- sky/backends/cloud_vm_ray_backend.py | 7 ++-- sky/cli.py | 50 +++++++++++++++++++++++++--- sky/core.py | 49 +++++++++++++++++++-------- sky/serve/__init__.py | 1 + sky/serve/load_balancer.py | 4 ++- sky/serve/serve_utils.py | 8 +++++ 9 files changed, 110 insertions(+), 30 deletions(-) diff --git a/examples/serve/ray_serve/serve.py b/examples/serve/ray_serve/serve.py index a08918cf0f0..8aecabd3e78 100644 --- a/examples/serve/ray_serve/serve.py +++ b/examples/serve/ray_serve/serve.py @@ -1,17 +1,18 @@ from typing import Dict from ray import serve -from starlette.requests import Request +from starlette import requests -@serve.deployment(route_prefix="/", num_replicas=2) +@serve.deployment(route_prefix='/', num_replicas=2) class ModelDeployment: def __init__(self, msg: str): self._msg = msg - def __call__(self, request: Request) -> Dict: - return {"result": self._msg} + def __call__(self, request: requests.Request) -> Dict: + del request # unused + return {'result': self._msg} -app = ModelDeployment.bind(msg="Hello Ray Serve!") +app = ModelDeployment.bind(msg='Hello Ray Serve!') diff --git a/sky/__init__.py b/sky/__init__.py index 0b0a71d7907..e5902edf450 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -39,6 +39,7 @@ from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget from sky.resources import Resources +from sky.serve import ServiceComponent from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus from sky.status_lib import ReplicaStatus @@ -77,7 +78,10 @@ 'Storage', 'StorageMode', 'StoreType', + 'ServiceComponent', 'ClusterStatus', + 'ReplicaStatus', + 
'ServiceStatus', 'JobStatus', # APIs 'Dag', diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 9fd4cb715a0..092b4af6ab6 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2754,10 +2754,10 @@ def _refresh_service_record_no_lock( return record, None controller_name = record['controller_name'] - cluster_record = global_user_state.get_cluster_from_name(controller_name) - assert cluster_record is not None + status, handle = refresh_cluster_status_handle(controller_name) + if status == status_lib.ClusterStatus.STOPPED: + return record, None - handle = cluster_record['handle'] backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index ee878a24b1c..4690e355bb2 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3793,12 +3793,13 @@ def tail_spot_logs(self, def tail_serve_logs(self, handle: CloudVmRayResourceHandle, service_handle: serve_lib.ServiceHandle, - controller: bool, load_balancer: bool, + target: serve_lib.ServiceComponent, replica_id: Optional[int], follow: bool) -> None: - if controller or load_balancer: + if target != serve_lib.ServiceComponent.REPLICA: code = serve_lib.ServeCodeGen.stream_serve_process_logs( service_handle.service_name, - stream_controller=controller, + stream_controller=( + target == serve_lib.ServiceComponent.CONTROLLER), follow=follow) else: assert replica_id is not None, service_handle diff --git a/sky/cli.py b/sky/cli.py index f57054a510f..0842aeb07ac 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4390,6 +4390,12 @@ def _down_service(name: str): default=False, required=False, help='Show the load balancer logs of this service.') +@click.option('--target', + default=None, + type=click.Choice(['controller', 'load-balancer', 'replica'], + case_sensitive=False), + required=False, + help='Target to stream 
logs.') @click.argument('service_name', required=True, type=str, @@ -4402,6 +4408,7 @@ def serve_logs( controller: bool, load_balancer: bool, replica_id: Optional[int], + target: Optional[str], ): """Tail the log of a service. @@ -4417,15 +4424,48 @@ def serve_logs( \b # Tail the logs of replica 1 sky serve logs [SERVICE_ID] 1 + \b + # Specify target to stream logs by `--target` is also supported + sky serve logs --target controller [SERVICE_ID] + sky serve logs --target load-balancer [SERVICE_ID] + sky serve logs --target replica [SERVICE_ID] 1 + \b + # If both --target and --controller/--load-balancer are specified, + # --controller/--load-balancer takes precedence. + # Tail the controller logs of a service: + sky serve logs --controller --target load-balancer [SERVICE_ID] """ have_replica_id = replica_id is not None - if (controller + load_balancer + have_replica_id) != 1: + num_flags = (controller + load_balancer + have_replica_id) + if num_flags > 1: + raise click.UsageError('At most one of --controller, --load-balancer, ' + '[REPLICA_ID] can be specified.') + if num_flags == 0 and target is None: raise click.UsageError( - 'One and only one of --controller, --load-balancer, ' - '[REPLICA_ID] can be specified.') + 'One of --controller, --load-balancer, [REPLICA_ID] or --target ' + 'must be specified.') + if controller: + if target is not None: + click.secho(f'Overriding --target={target} with --controller.', + fg='yellow') + target_component = sky.ServiceComponent.CONTROLLER + elif load_balancer: + if target is not None: + click.secho(f'Overriding --target={target} with --load-balancer.', + fg='yellow') + target_component = sky.ServiceComponent.LOAD_BALANCER + elif target is not None: + # Change load-balancer to load_balancer to match the enum. 
+ target = target.replace('-', '_') + target_component = sky.ServiceComponent(target) + if (target_component == sky.ServiceComponent.REPLICA and + not have_replica_id): + raise click.UsageError( + 'REPLICA_ID must be specified when using --target replica.') + else: + target_component = sky.ServiceComponent.REPLICA core.serve_tail_logs(service_name, - controller=controller, - load_balancer=load_balancer, + target=target_component, replica_id=replica_id, follow=follow) diff --git a/sky/core.py b/sky/core.py index e9dd53894d1..1d250cc972d 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1044,31 +1044,55 @@ def serve_status( def serve_tail_logs( service_name: str, *, - controller: bool = False, - load_balancer: bool = False, + target: Union[str, serve.ServiceComponent], replica_id: Optional[int] = None, follow: bool = True, ) -> None: """Tail logs for a service. Usage: - core.serve_tail_logs(service_name, =, follow=True/False) + core.serve_tail_logs( + service_name, + target=, + follow=False, # Optionally, default to True + # replica_id=3, # Must be specified when target is REPLICA. + ) - One and only one of must be specified: controller, load_balancer, - or replica_id. + `target` is a enum of sky.ServiceComponent, which can be one of: + - CONTROLLER + - LOAD_BALANCER + - REPLICA + Pass target as a lower-case string is also supported, e.g. + target='controller'. + To use REPLICA, you must specify `replica_id`. To tail controller logs: # follow default to True - core.serve_tail_logs(service_name, controller=True) + core.serve_tail_logs( + service_name, target=sky.ServiceComponent.CONTROLLER) To print replica 3 logs: - core.serve_tail_logs(service_name, replica_id=3, follow=False) + # Pass target as a lower-case string is also supported. 
+ core.serve_tail_logs( + service_name, target='replica', + follow=False, replica_id=3) """ - have_replica_id = replica_id is not None - if (controller + load_balancer + have_replica_id) != 1: + if isinstance(target, str): + target = serve.ServiceComponent(target) + if not isinstance(target, serve.ServiceComponent): with ux_utils.print_exception_no_traceback(): - raise ValueError('One and only one of controller, load_balancer, ' - 'or replica_id must be specified.') + raise ValueError(f'`target` must be a string or ' + f'sky.ServiceComponent, got {type(target)}.') + if target == serve.ServiceComponent.REPLICA: + if replica_id is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + '`replica_id` must be specified when using target=REPLICA.') + else: + if replica_id is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError('`replica_id` must be None when using ' + 'target=CONTROLLER/LOAD_BALANCER.') service_record = global_user_state.get_service_from_name(service_name) if service_record is None: with ux_utils.print_exception_no_traceback(): @@ -1086,8 +1110,7 @@ def serve_tail_logs( assert isinstance(backend, backends.CloudVmRayBackend), backend backend.tail_serve_logs(handle, service_handle, - controller, - load_balancer, + target, replica_id, follow=follow) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 4a2b475d292..5bd7d861572 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -22,6 +22,7 @@ from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import load_terminate_service_result from sky.serve.serve_utils import ServeCodeGen +from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index c1e661c8faa..5b94c51b018 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -69,7 +69,9 @@ 
def _sync_with_controller(self): response = session.get(self.controller_url + '/controller/is_terminating') response.raise_for_status() - if bool(response.json()['is_terminating']): + logger.debug( + f'Controller terminating status: {response.json()}') + if response.json().get('is_terminating'): logger.info('Controller is terminating. ' 'Shutting down load balancer.') os.kill(os.getpid(), signal.SIGINT) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 6fc0a3d4371..93797b18d8c 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -2,6 +2,7 @@ import base64 import collections import copy +import enum import os import pickle import re @@ -44,6 +45,13 @@ 'Finished server process', ] + +class ServiceComponent(enum.Enum): + CONTROLLER = 'controller' + LOAD_BALANCER = 'load_balancer' + REPLICA = 'replica' + + KeyType = TypeVar('KeyType') ValueType = TypeVar('ValueType') From 82d01eaa77f313502eeb3646f01e5b15d11e9b88 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Oct 2023 13:18:48 -0700 Subject: [PATCH 102/223] remove stop hint --- sky/backends/backend_utils.py | 5 ----- sky/cli.py | 18 +++++------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 092b4af6ab6..1c15cc984f4 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -110,7 +110,6 @@ class ReservedClusterRecord: group_name: str check: Callable[[str], bool] sky_status_hint: str - decline_stop_hint: str decline_cancel_hint: str check_cluster_name_hint: str @@ -125,8 +124,6 @@ class ReservedClusterGroup(enum.Enum): sky_status_hint=( f'* To see detailed spot job status: {colorama.Style.BRIGHT}' f'sky spot queue{colorama.Style.RESET_ALL}'), - decline_stop_hint=('Spot controller will be auto-stopped after all ' - 'spot jobs finish.'), decline_cancel_hint=( 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' f'spot jobs, use: 
{colorama.Style.BRIGHT}sky spot cancel Date: Tue, 10 Oct 2023 14:08:58 -0700 Subject: [PATCH 103/223] env vars & monir --- sky/backends/backend_utils.py | 10 +++--- sky/core.py | 40 +++++++++++++++------- sky/execution.py | 31 +++++++++-------- sky/skylet/constants.py | 8 ++++- sky/templates/sky-serve-controller.yaml.j2 | 5 +++ sky/templates/spot-controller.yaml.j2 | 5 +++ 6 files changed, 65 insertions(+), 34 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1c15cc984f4..8e3de170895 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1066,10 +1066,10 @@ def write_cluster_config( 'disk_size': to_provision.disk_size, # If the current code is run by controller, propagate the real # calling user which should've been passed in as the - # SKYPILOT_USER env var (see spot-controller.yaml.j2), also - # execution.py::serve_up. - 'user': get_cleaned_username(os.environ.get( - 'SKYPILOT_USER', '')), + # SKYPILOT_USER env var (see + # execution.py::_shared_controller_env_vars). + 'user': get_cleaned_username( + os.environ.get(constants.USER_ENV_VAR, '')), # AWS only: 'vpc_name': skypilot_config.get_nested(('aws', 'vpc_name'), @@ -2750,7 +2750,7 @@ def _refresh_service_record_no_lock( controller_name = record['controller_name'] status, handle = refresh_cluster_status_handle(controller_name) - if status == status_lib.ClusterStatus.STOPPED: + if status is None or status == status_lib.ClusterStatus.STOPPED: return record, None backend = get_backend_from_handle(handle) diff --git a/sky/core.py b/sky/core.py index 1d250cc972d..439c8463591 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1076,6 +1076,10 @@ def serve_tail_logs( core.serve_tail_logs( service_name, target='replica', follow=False, replica_id=3) + + Raises: + sky.exceptions.ClusterNotUpError: the sky serve controller is not up. + ValueError: arguments not valid, or failed to tail the logs. 
""" if isinstance(target, str): target = serve.ServiceComponent(target) @@ -1100,11 +1104,17 @@ def serve_tail_logs( 'Cannot stream logs.') service_handle: serve.ServiceHandle = service_record['handle'] controller_name = service_record['controller_name'] - handle = global_user_state.get_handle_from_cluster_name(controller_name) - if handle is None: + controller_status, handle = backend_utils.refresh_cluster_status_handle( + controller_name) + if controller_status is None: with ux_utils.print_exception_no_traceback(): raise ValueError( f'Cannot find controller for service {service_name}.') + if controller_status == status_lib.ClusterStatus.STOPPED: + with ux_utils.print_exception_no_traceback(): + raise exceptions.ClusterNotUpError( + f'Controller for service {service_name} is auto-stopped.', + cluster_status=controller_status) assert isinstance(handle, backends.CloudVmRayResourceHandle), handle backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend @@ -1124,6 +1134,10 @@ def serve_down(service_name: str, purge: bool = False) -> None: Args: service_name: Name of the service. purge: If true, ignore errors when cleaning up the controller. + + Raises: + ValueError: if the service does not exist. + RuntimeError: if failed to terminate the service. """ service_record = global_user_state.get_service_from_name(service_name) @@ -1136,10 +1150,15 @@ def serve_down(service_name: str, purge: bool = False) -> None: handle = global_user_state.get_handle_from_cluster_name(controller_name) controller_fetch_ip_error_message = ( - 'Failed to fetch controller IP. Please ' - 'check controller status and try again.') + 'Failed to fetch controller IP. 
Please refresh controller status by ' + '`sky status -r ` and try again.') - if handle is not None: + if handle is None: + if not purge: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'Cannot find controller of service {service_name!r}.') + else: backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) try: @@ -1169,14 +1188,14 @@ def serve_down(service_name: str, purge: bool = False) -> None: raise RuntimeError('Failed to terminate replica of service ' f'{service_name!r} due to request ' f'failure: {resp.text}') - msg = resp.json()['message'] + msg = resp.json().get('message') if msg: with ux_utils.print_exception_no_traceback(): raise RuntimeError( 'Unexpected message when tearing down replica of ' f'service {service_name!r}: {msg}. Please login to ' - 'the controller and make sure the service is properly ' - 'cleaned up.') + 'the controller by `ssh ` and ' + 'make sure the service is properly cleaned up.') # We want to make sure no matter what error happens, we can still # clean up the record if purge is True. @@ -1189,11 +1208,6 @@ def serve_down(service_name: str, purge: bool = False) -> None: else: with ux_utils.print_exception_no_traceback(): raise RuntimeError(e) from e - else: - if not purge: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Cannot find controller of service {service_name!r}.') try: if handle is not None: diff --git a/sky/execution.py b/sky/execution.py index 6b86284347c..d61d0ac9dc9 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -585,17 +585,20 @@ def exec( # pylint: disable=redefined-builtin def _shared_controller_env_vars() -> Dict[str, Any]: - return { - 'SKYPILOT_USER_ID': common_utils.get_user_hash(), - 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK': 1, + env_vars: Dict[str, Any] = { + env.value: 1 for env in env_options.Options if env.get() + } + # TODO(tian): Why does spot controller not set this env variable? 
+ env_vars.pop(env_options.Options.MINIMIZE_LOGGING.value, None) + env_vars.update({ # Should not use $USER here, as that env var can be empty when # running in a container. - 'SKYPILOT_USER': getpass.getuser(), - 'SKYPILOT_DEV': env_options.Options.IS_DEVELOPER.get(), - 'SKYPILOT_DEBUG': env_options.Options.SHOW_DEBUG_INFO.get(), - 'SKYPILOT_DISABLE_USAGE_COLLECTION': - env_options.Options.DISABLE_LOGGING.get(), - } + constants.USER_ENV_VAR: getpass.getuser(), + constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), + # Skip cloud identity check to avoid the overhead. + env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: 1, + }) + return env_vars @usage_lib.entrypoint @@ -715,12 +718,13 @@ def spot_launch( remote_user_config_path = ( f'{prefix}/{dag.name}-{dag_uuid}.config_yaml') common_utils.dump_yaml(tmpfile.name, config_dict) + spot_env_vars[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( + remote_user_config_path) vars_to_fill.update({ 'user_config_path': tmpfile.name, 'remote_user_config_path': remote_user_config_path, + 'envs': spot_env_vars, }) - spot_env_vars[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( - remote_user_config_path) # Override the controller resources with the ones specified in the # config. @@ -760,8 +764,6 @@ def spot_launch( controller_task.spot_dag = dag assert len(controller_task.resources) == 1 - controller_task.update_envs(spot_env_vars) - print(f'{colorama.Fore.YELLOW}' f'Launching managed spot job {dag.name!r} from spot controller...' f'{colorama.Style.RESET_ALL}') @@ -1082,6 +1084,7 @@ def serve_up( 'app_port': task.service.app_port, 'controller_log_file': controller_log_file, 'load_balancer_log_file': load_balancer_log_file, + 'envs': _shared_controller_env_vars(), } controller_yaml_path = serve.generate_controller_yaml_file_name( service_name) @@ -1096,8 +1099,6 @@ def serve_up( # to support a larger number of services. 
controller_task.service_handle = service_handle - controller_task.update_envs(_shared_controller_env_vars()) - fore = colorama.Fore style = colorama.Style print(f'\n{fore.YELLOW}Launching controller for {service_name!r}...' diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 5785e3d9a4f..e557dbf34d4 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -81,9 +81,15 @@ # The name for the environment variable that stores SkyPilot user hash, which # is mainly used to make sure sky commands runs on a VM launched by SkyPilot -# will be recognized as the same user (e.g., spot controller). +# will be recognized as the same user (e.g., spot controller or sky serve +# controller). USER_ID_ENV_VAR = 'SKYPILOT_USER_ID' +# The name for the environment variable that stores SkyPilot user name. +# Similar to USER_ID_ENV_VAR, this is mainly used to make sure sky commands +# runs on a VM launched by SkyPilot will be recognized as the same user. +USER_ENV_VAR = 'SKYPILOT_USER' + # In most clouds, cluster names can only contain lowercase letters, numbers # and hyphens. We use this regex to validate the cluster name. CLUSTER_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' 
diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 00d96111891..bfc1e662bf9 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -30,3 +30,8 @@ run: | python -u -m sky.serve.load_balancer --load-balancer-port {{load_balancer_port}} \ --app-port {{app_port}} --controller-addr http://localhost:{{controller_port}} \ > {{load_balancer_log_file}} 2>&1 + +envs: +{%- for env_name, env_value in envs.items() %} + {{env_name}}: {{env_value}} +{%- endfor %} diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index db24fbb8a42..5f28ebc29b4 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -36,3 +36,8 @@ run: | python -u -m sky.spot.controller \ {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml \ --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %} + +envs: +{%- for env_name, env_value in envs.items() %} + {{env_name}}: {{env_value}} +{%- endfor %} From ef24cda217a4c05aeedb960de9cba9b2e1818dbf Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Oct 2023 14:16:40 -0700 Subject: [PATCH 104/223] use cluster regex and remove service regex --- sky/execution.py | 8 ++++++-- sky/serve/__init__.py | 1 - sky/serve/constants.py | 7 ------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index d61d0ac9dc9..b0c7bac9bda 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -966,12 +966,16 @@ def serve_up( if service_name is None: service_name = backend_utils.generate_service_name() - if re.fullmatch(serve.SERVICE_NAME_VALID_REGEX, service_name) is None: + # The service name will be used as: + # 1. controller cluster name: 'sky-serve-controller-' + # 2. replica cluster name: '-' + # In both cases, service name shares the same regex with cluster name. 
+ if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service name {service_name!r} is invalid: ' f'ensure it is fully matched by regex (e.g., ' 'only contains lower letters, numbers and dash): ' - f'{serve.SERVICE_NAME_VALID_REGEX}') + f'{constants.CLUSTER_NAME_VALID_REGEX}') if global_user_state.get_service_from_name(service_name) is not None: with ux_utils.print_exception_no_traceback(): diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 5bd7d861572..1be687d78e0 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -9,7 +9,6 @@ from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import SERVE_PREFIX -from sky.serve.constants import SERVICE_NAME_VALID_REGEX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.serve_utils import gen_ports_for_serve_process from sky.serve.serve_utils import generate_controller_yaml_file_name diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 260aabaa86b..f3cbe265ce9 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -11,13 +11,6 @@ SERVE_PREFIX = '~/.sky/serve' -# This is the same with sky.skylet.constants.CLUSTER_NAME_VALID_REGEX -# The service name will be used as: -# 1. controller cluster name: 'sky-serve-controller-' -# 2. replica cluster name: '-' -# In both cases, service name shares the same regex with cluster name. -SERVICE_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' - # The filelock for selecting controller and service ports when starting a # service. In our current multi-service controller implementation, we need to: # 1. 
Select a controller if there are some existing controllers; From 4870e7388f6d6db04d006eb56c7b1c5f45729ed2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Oct 2023 14:28:45 -0700 Subject: [PATCH 105/223] remove pylint hint & rename app_port -> replica_port --- sky/backends/backend_utils.py | 3 --- sky/execution.py | 8 +++----- sky/serve/load_balancer.py | 10 +++++----- sky/serve/service_spec.py | 18 +++++++++--------- sky/templates/sky-serve-controller.yaml.j2 | 4 ++-- 5 files changed, 19 insertions(+), 24 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 8e3de170895..03a606d1513 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2774,9 +2774,6 @@ def _refresh_service_record_no_lock( def _refresh_service_record( service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: - # TODO(tian): remove pylint disabling when filelock - # version updated - # pylint: disable=abstract-class-instantiated with filelock.FileLock(SERVICE_STATUS_LOCK_PATH.format(service_name), SERVICE_STATUS_LOCK_TIMEOUT_SECONDS): return _refresh_service_record_no_lock(service_name) diff --git a/sky/execution.py b/sky/execution.py index b0c7bac9bda..d62687c46ff 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1009,7 +1009,8 @@ def serve_up( 'Specifying ports in resources is not allowed. SkyServe will ' 'use the port specified in the service section.') - task.set_resources(requested_resources.copy(ports=[task.service.app_port])) + task.set_resources( + requested_resources.copy(ports=[task.service.replica_port])) # Use filelock here to make sure only one process can write to database # at the same time. Then we generate available controller name again to @@ -1018,9 +1019,6 @@ def serve_up( # In the same time, generate ports for the controller and load balancer. # Use file lock to make sure the ports are unique to each service. 
try: - # TODO(tian): remove pylint disabling when filelock - # version updated - # pylint: disable=abstract-class-instantiated with filelock.FileLock( os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH), serve.CONTROLLER_FILE_LOCK_TIMEOUT): @@ -1085,7 +1083,7 @@ def serve_up( 'service_name': service_name, 'controller_port': controller_port, 'load_balancer_port': load_balancer_port, - 'app_port': task.service.app_port, + 'replica_port': task.service.replica_port, 'controller_log_file': controller_log_file, 'load_balancer_log_file': load_balancer_log_file, 'envs': _shared_controller_env_vars(), diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 5b94c51b018..cb6bb563e61 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -29,7 +29,7 @@ class SkyServeLoadBalancer: """ def __init__( - self, controller_url: str, load_balancer_port: int, app_port: int, + self, controller_url: str, load_balancer_port: int, replica_port: int, load_balancing_policy: load_balancing_policies.LoadBalancingPolicy ) -> None: self.app = fastapi.FastAPI() @@ -37,7 +37,7 @@ def __init__( # This is the port where the load balancer listens to. self.load_balancer_port = load_balancer_port # This is the port where the replica app listens to. 
- self.app_port = app_port + self.replica_port = replica_port self.load_balancing_policy = load_balancing_policy self.setup_query_interval() @@ -107,7 +107,7 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_ID]" ' 'to check the replica status.') - path = f'http://{replica_ip}:{self.app_port}{request.url.path}' + path = f'http://{replica_ip}:{self.replica_port}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) @@ -133,7 +133,7 @@ def run(self): type=int, help='Port to run the load balancer on.', required=True) - parser.add_argument('--app-port', + parser.add_argument('--replica-port', type=int, help='Port that runs app on replica.', required=True) @@ -150,6 +150,6 @@ def run(self): load_balancer = SkyServeLoadBalancer( controller_url=args.controller_addr, load_balancer_port=args.load_balancer_port, - app_port=args.app_port, + replica_port=args.replica_port, load_balancing_policy=_load_balancing_policy) load_balancer.run() diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index a3e78a0e3d2..db9847cc6ba 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -19,7 +19,7 @@ def __init__( self, readiness_path: str, initial_delay_seconds: int, - app_port: int, + replica_port: int, min_replicas: int, max_replicas: Optional[int] = None, qps_upper_threshold: Optional[float] = None, @@ -43,12 +43,12 @@ def __init__( f'Got: {readiness_path}') self._readiness_path = readiness_path self._initial_delay_seconds = initial_delay_seconds - if app_port < 0 or app_port > 65535: + if replica_port < 0 or replica_port > 65535: with ux_utils.print_exception_no_traceback(): raise ValueError( - f'Invalid app port: {app_port}. ' + f'Invalid app port: {replica_port}. 
' 'Please use a port number between 0 and 65535.') - self._app_port = str(app_port) + self._replica_port = str(replica_port) self._min_replicas = min_replicas self._max_replicas = max_replicas self._qps_upper_threshold = qps_upper_threshold @@ -68,7 +68,7 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': 'the service YAML. Please use one of them.') service_config = {} - service_config['app_port'] = config['port'] + service_config['replica_port'] = config['port'] readiness_section = config['readiness_probe'] if isinstance(readiness_section, str): @@ -157,7 +157,7 @@ def add_if_not_none(section, key, value, no_empty: bool = False): config[section] = dict() config[section][key] = value - add_if_not_none('port', None, int(self.app_port)) + add_if_not_none('port', None, int(self.replica_port)) add_if_not_none('readiness_probe', 'path', self.readiness_path) add_if_not_none('readiness_probe', 'initial_delay_seconds', self.initial_delay_seconds) @@ -199,7 +199,7 @@ def __repr__(self) -> str: @property def readiness_suffix(self) -> str: - return f':{self._app_port}{self._readiness_path}' + return f':{self._replica_port}{self._readiness_path}' @property def readiness_path(self) -> str: @@ -210,8 +210,8 @@ def initial_delay_seconds(self) -> int: return self._initial_delay_seconds @property - def app_port(self) -> str: - return self._app_port + def replica_port(self) -> str: + return self._replica_port @property def min_replicas(self) -> int: diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index bfc1e662bf9..d451618e28f 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -28,8 +28,8 @@ run: | # foreground so that the job will not finish, thus prevent our controller # from auto down. 
python -u -m sky.serve.load_balancer --load-balancer-port {{load_balancer_port}} \ - --app-port {{app_port}} --controller-addr http://localhost:{{controller_port}} \ - > {{load_balancer_log_file}} 2>&1 + --replica-port {{replica_port}} --controller-addr \ + http://localhost:{{controller_port}} > {{load_balancer_log_file}} 2>&1 envs: {%- for env_name, env_value in envs.items() %} From f1406cfca79100b1e67b2dd7d5c2f0fa2db7d458 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Oct 2023 14:56:25 -0700 Subject: [PATCH 106/223] change constants used in _maybe_translate_local_file_mounts_and_sync_up --- sky/execution.py | 38 +++++++++++++++++++++----------------- sky/skylet/constants.py | 6 ++++++ sky/spot/constants.py | 6 ------ 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index d62687c46ff..aaf2102d0a5 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -646,7 +646,7 @@ def spot_launch( dag_utils.fill_default_spot_config_in_dag_for_spot_launch(dag) for task_ in dag.tasks: - _maybe_translate_local_file_mounts_and_sync_up(task_) + _maybe_translate_local_file_mounts_and_sync_up(task_, prefix='spot') with tempfile.NamedTemporaryFile(prefix=f'spot-dag-{dag.name}-', mode='w') as f: @@ -780,7 +780,8 @@ def spot_launch( ) -def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): +def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, + prefix: str): """Translates local->VM mounts into Storage->VM, then syncs up any Storage. Eagerly syncing up local->Storage ensures Storage->VM would work at task @@ -818,7 +819,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): # Step 1: Translate the workdir to SkyPilot storage. 
new_storage_mounts = {} if task.workdir is not None: - bucket_name = spot.constants.SPOT_WORKDIR_BUCKET_NAME.format( + bucket_name = constants.WORKDIR_BUCKET_NAME.format( username=getpass.getuser(), id=run_id) workdir = task.workdir task.workdir = None @@ -852,7 +853,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - bucket_name = spot.constants.SPOT_FM_BUCKET_NAME.format( + bucket_name = constants.FM_BUCKET_NAME.format( username=getpass.getuser(), id=f'{run_id}-{i}', ) @@ -868,12 +869,13 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): # Step 3: Translate local file mounts with file in src to SkyPilot storage. # Hard link the files in src to a temporary directory, and upload folder. - local_fm_path = os.path.join( - tempfile.gettempdir(), - spot.constants.SPOT_FM_LOCAL_TMP_DIR.format(id=run_id)) + local_fm_path = os.path.join(tempfile.gettempdir(), + constants.FM_LOCAL_TMP_DIR.format(id=run_id)) os.makedirs(local_fm_path, exist_ok=True) - file_bucket_name = spot.constants.SPOT_FM_FILE_ONLY_BUCKET_NAME.format( + file_bucket_name = constants.FM_FILE_ONLY_BUCKET_NAME.format( username=getpass.getuser(), id=run_id) + file_mount_remote_tmp_dir = constants.FM_REMOTE_TMP_DIR.format( + prefix=prefix) if copy_mounts_with_file_in_src: src_to_file_id = {} for i, src in enumerate(set(copy_mounts_with_file_in_src.values())): @@ -882,18 +884,17 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): os.path.join(local_fm_path, f'file-{i}')) new_storage_mounts[ - spot.constants. 
- SPOT_FM_REMOTE_TMP_DIR] = storage_lib.Storage.from_yaml_config({ + file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({ 'name': file_bucket_name, 'source': local_fm_path, 'persistent': False, 'mode': 'MOUNT', }) - if spot.constants.SPOT_FM_REMOTE_TMP_DIR in original_storage_mounts: + if file_mount_remote_tmp_dir in original_storage_mounts: with ux_utils.print_exception_no_traceback(): raise ValueError( 'Failed to translate file mounts, due to the default ' - f'destination {spot.constants.SPOT_FM_REMOTE_TMP_DIR} ' + f'destination {file_mount_remote_tmp_dir} ' 'being taken.') sources = list(src_to_file_id.keys()) sources_str = '\n\t'.join(sources) @@ -917,7 +918,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task): # /original-dst: s3://spot-fm-file-only-bucket-name/file-0 new_file_mounts = {} for dst, src in copy_mounts_with_file_in_src.items(): - storage = task.storage_mounts[spot.constants.SPOT_FM_REMOTE_TMP_DIR] + storage = task.storage_mounts[file_mount_remote_tmp_dir] store_type = list(storage.stores.keys())[0] store_prefix = storage_lib.get_store_prefix(store_type) bucket_url = store_prefix + file_bucket_name @@ -1050,9 +1051,7 @@ def serve_up( f'{service_name!r}. Please check if there are some ' '`sky serve up` process hanging abnormally.') from e - # TODO(tian): Use skyserve constants, or maybe refactor these constants - # out of spot constants since their name is mostly not spot-specific. 
- _maybe_translate_local_file_mounts_and_sync_up(task) + _maybe_translate_local_file_mounts_and_sync_up(task, prefix='serve') ephemeral_storage = [] if task.storage_mounts is not None: for storage in task.storage_mounts.values(): @@ -1152,4 +1151,9 @@ def serve_up( f'{style.RESET_ALL}{fore.CYAN}' f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') - print('Please use the above command to find the latest status.') + print('\nTo monitor if replicas are ready:' + f'\n\t{backend_utils.BOLD}watch -n10 sky serve status ' + f'{service_name}{backend_utils.RESET_BOLD}' + '\nTo send a test request:' + f'\n\t{backend_utils.BOLD}curl -L $(sky serve status ' + f'{service_name} --endpoint){backend_utils.RESET_BOLD}') diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index e557dbf34d4..23d4f9971b6 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -93,3 +93,9 @@ # In most clouds, cluster names can only contain lowercase letters, numbers # and hyphens. We use this regex to validate the cluster name. CLUSTER_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' 
+ +WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}' +FM_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' +FM_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' +FM_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' +FM_REMOTE_TMP_DIR = '/tmp/sky-{prefix}-filemounts-files' diff --git a/sky/spot/constants.py b/sky/spot/constants.py index 4213e86d525..2e01d64e939 100644 --- a/sky/spot/constants.py +++ b/sky/spot/constants.py @@ -7,12 +7,6 @@ SPOT_TASK_YAML_PREFIX = '~/.sky/spot_tasks' -SPOT_WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}' -SPOT_FM_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' -SPOT_FM_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' -SPOT_FM_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' -SPOT_FM_REMOTE_TMP_DIR = '/tmp/sky-spot-filemounts-files' - # Resources as a dict for the spot controller. # Use default CPU instance type for spot controller, i.e. # m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB) From 686ebd90164d50582661f41f36abd0509b1c0164 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Oct 2023 10:43:16 -0700 Subject: [PATCH 107/223] add --endpoint and ux --- sky/cli.py | 32 +++++++++++++++++++++++++++-- sky/execution.py | 8 +++----- sky/utils/cli_utils/status_utils.py | 15 ++++++++++---- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 61a3cbe1f67..94ce7685771 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4150,6 +4150,11 @@ def serve_up( is_flag=True, required=False, help='Show all information in full.') +@click.option('--endpoint', + default=False, + is_flag=True, + required=False, + help='Show service endpoint.') @click.argument('service_names', required=False, type=str, @@ -4157,11 +4162,12 @@ def serve_up( **_get_shell_complete_args(_complete_service_name)) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def serve_status(all: bool, service_names: List[str]): +def serve_status(all: bool, 
endpoint: bool, service_names: List[str]): """Show statuses of SkyServe service. Show detailed statuses of the service. If SERVICE_NAME is not provided, - show all services' status. + show all services' status. If --endpoint is specified, output the endpoint + of the service only. Each service can have one of the following statuses: @@ -4232,6 +4238,28 @@ def serve_status(all: bool, service_names: List[str]): # Only show status of my-service sky serve status my-service """ + if endpoint: + if len(service_names) != 1: + plural = 's' if len(service_names) > 1 else '' + service_num = (str(len(service_names)) + if len(service_names) > 0 else 'No') + raise click.UsageError( + f'{service_num} service{plural} specified. Please specify an' + ' existing service to show its endpoint. Usage: ' + '`sky serve status --endpoint `') + service_name = service_names[0] + record = global_user_state.get_service_from_name(service_name) + if record is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'\nService {service_name!r} not found.') + service_endpoint = status_utils.get_endpoint(record) + if service_endpoint is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Endpoint not found for service {service_name!r}. ' + 'Please check whether the service is ready.') + click.echo(service_endpoint) + return query_services: Optional[List[str]] = None if service_names: query_services = _get_glob_services(service_names) diff --git a/sky/execution.py b/sky/execution.py index aaf2102d0a5..45d5555783f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -588,8 +588,6 @@ def _shared_controller_env_vars() -> Dict[str, Any]: env_vars: Dict[str, Any] = { env.value: 1 for env in env_options.Options if env.get() } - # TODO(tian): Why does spot controller not set this env variable? 
- env_vars.pop(env_options.Options.MINIMIZE_LOGGING.value, None) env_vars.update({ # Should not use $USER here, as that env var can be empty when # running in a container. @@ -1151,9 +1149,9 @@ def serve_up( f'{style.RESET_ALL}{fore.CYAN}' f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') - print('\nTo monitor if replicas are ready:' - f'\n\t{backend_utils.BOLD}watch -n10 sky serve status ' + print('\nTo monitor replica status:' + f'\t{backend_utils.BOLD}watch -n10 sky serve status ' f'{service_name}{backend_utils.RESET_BOLD}' '\nTo send a test request:' - f'\n\t{backend_utils.BOLD}curl -L $(sky serve status ' + f'\t\t{backend_utils.BOLD}curl -L $(sky serve status ' f'{service_name} --endpoint){backend_utils.RESET_BOLD}') diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 5796894f94b..af376f090cf 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -127,7 +127,7 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): StatusColumn('CONTROLLER_NAME', _get_controller_name, show_by_default=False), - StatusColumn('ENDPOINT', _get_endpoint), + StatusColumn('ENDPOINT', _get_display_endpoint), StatusColumn('POLICY', _get_policy, show_by_default=False), StatusColumn('REQUESTED_RESOURCES', _get_requested_resources, @@ -432,15 +432,22 @@ def _get_replicas(service_record: _ServiceRecord) -> str: return f'{ready_replica_num}/{total_replica_num}' -def _get_endpoint(service_record: _ServiceRecord) -> str: +def get_endpoint(service_record: _ServiceRecord) -> Optional[str]: handle = _get_service_handle(service_record) if handle.endpoint_ip is None: - return '-' + return None if handle.load_balancer_port is None: - return '-' + return None return f'{handle.endpoint_ip}:{handle.load_balancer_port}' +def _get_display_endpoint(service_record: _ServiceRecord) -> str: + endpoint = get_endpoint(service_record) 
+ if endpoint is None: + return '-' + return endpoint + + def _get_service_status( service_record: _ServiceRecord) -> status_lib.ServiceStatus: return service_record['status'] From 9ee2cf0169a3aafbcbb9e72e8d2d08128be0cd18 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Oct 2023 12:45:56 -0700 Subject: [PATCH 108/223] new termination of controller & lb; minor suggestions --- sky/serve/controller.py | 7 +-- sky/serve/infra_providers.py | 77 ++++++++++++++-------------- sky/serve/load_balancer.py | 5 +- sky/serve/load_balancing_policies.py | 16 +++--- sky/serve/serve_utils.py | 63 +++++------------------ 5 files changed, 67 insertions(+), 101 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 705b8a5a997..a4921011303 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -6,9 +6,7 @@ import asyncio import base64 import logging -import os import pickle -import signal import threading import time from typing import Optional @@ -23,6 +21,7 @@ from sky.serve import autoscalers from sky.serve import infra_providers from sky.serve import serve_state +from sky.serve import serve_utils from sky.utils import env_options # Use the explicit logger name so that the logger is under the @@ -64,7 +63,9 @@ def _check_terminate(self): # return of /terminate request is not ready yet. time.sleep(1) logger.info('Terminate controller...') - os.kill(os.getpid(), signal.SIGINT) + # TODO(tian): Directly kill all threads and cleanup using db + # record, instead of waiting the threads to receive signal. 
+ serve_utils.kill_children_and_self_processes() time.sleep(10) def run(self) -> None: diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index aca16b54c45..92ab5ac7d6b 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -1,5 +1,6 @@ """InfraProvider: handles the creation and deletion of endpoint replicas.""" from concurrent import futures +import dataclasses import enum import logging import random @@ -49,37 +50,36 @@ class ProcessStatus(enum.Enum): # The process is running RUNNING = 'RUNNING' - # The process is finished and success - SUCCESS = 'SUCCESS' + # The process is finished and succeeded + SUCCEEDED = 'SUCCEEDED' # The process failed FAILED = 'FAILED' +@dataclasses.dataclass class ReplicaStatusProperty: """Some properties that determine replica status.""" - - def __init__(self) -> None: - # Process status of sky.launch - # Initial value is RUNNING since each `ReplicaInfo` is created - # when `sky.launch` is called. - self.sky_launch_status: ProcessStatus = ProcessStatus.RUNNING - # User job status in [FAILED, FAILED_SETUP] - self.user_app_failed: bool = False - # Latest readiness probe result - self.service_ready_now: bool = False - # Whether the service has been ready at least once - # If service was not ready before, we count how long it takes to startup - # and compare it with the initial delay seconds; otherwise, we count how - # many consecutive failures it has. - self.service_once_ready: bool = False - # Process status of sky.down. None means sky.down is not called yet. - self.sky_down_status: Optional[ProcessStatus] = None - - def is_scale_down_no_failure(self) -> bool: - if self.sky_launch_status != ProcessStatus.SUCCESS: + # Process status of sky.launch + # Initial value is RUNNING since each `ReplicaInfo` is created + # when `sky.launch` is called. 
+ sky_launch_status: ProcessStatus = ProcessStatus.RUNNING + # User job status in [FAILED, FAILED_SETUP] + user_app_failed: bool = False + # Latest readiness probe result + service_ready_now: bool = False + # Whether the service has been ready at least once + # If service was not ready before, we count how long it takes to startup + # and compare it with the initial delay seconds; otherwise, we count how + # many consecutive failures it has. + service_once_ready: bool = False + # Process status of sky.down. None means sky.down is not called yet. + sky_down_status: Optional[ProcessStatus] = None + + def is_scale_down_succeeded(self) -> bool: + if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False - if self.sky_down_status != ProcessStatus.SUCCESS: + if self.sky_down_status != ProcessStatus.SUCCEEDED: return False if self.user_app_failed: return False @@ -88,7 +88,7 @@ def is_scale_down_no_failure(self) -> bool: return self.service_once_ready def should_track_status(self) -> bool: - if self.sky_launch_status != ProcessStatus.SUCCESS: + if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False if self.sky_down_status is not None: return False @@ -277,7 +277,7 @@ def _refresh_process_pool(self) -> None: self._teardown_cluster(cluster_name, sync_down_logs=True) else: info.status_property.sky_launch_status = ( - ProcessStatus.SUCCESS) + ProcessStatus.SUCCEEDED) for cluster_name, p in list(self.down_process_pool.items()): if p.poll() is not None: logger.info(f'Down process for {cluster_name} finished.') @@ -290,12 +290,12 @@ def _refresh_process_pool(self) -> None: ProcessStatus.FAILED) else: info.status_property.sky_down_status = ( - ProcessStatus.SUCCESS) + ProcessStatus.SUCCEEDED) # Failed replica still count as a replica. In our current # design, we want to fail early if user code have any error. # This will prevent infinite loop of teardown and # re-provision. 
- if info.status_property.is_scale_down_no_failure(): + if info.status_property.is_scale_down_succeeded(): # This means the cluster is deleted due to # a scale down. Delete the replica info # so it won't count as a replica. @@ -413,9 +413,9 @@ def _launch_cluster(self, replica_id: int) -> None: # after we change to python API. cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) - fn = serve_utils.generate_replica_launch_log_file_name( + log_file_name = serve_utils.generate_replica_launch_log_file_name( self.service_name, replica_id) - with open(fn, 'w') as f: + with open(log_file_name, 'w') as f: # pylint: disable=consider-using-with p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, @@ -466,9 +466,9 @@ def _teardown_cluster(self, logger.info(f'Deleting SkyPilot cluster {cluster_name}') cmd = ['sky', 'down', cluster_name, '-y'] - fn = serve_utils.generate_replica_down_log_file_name( + log_file_name = serve_utils.generate_replica_down_log_file_name( self.service_name, replica_id) - with open(fn, 'w') as f: + with open(log_file_name, 'w') as f: # pylint: disable=consider-using-with p = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, @@ -516,7 +516,7 @@ def terminate(self) -> Optional[str]: self._teardown_cluster(name, sync_down_logs=False) info = self.replica_info[name] # Set to success here for correctly display as shutting down - info.status_property.sky_launch_status = ProcessStatus.SUCCESS + info.status_property.sky_launch_status = ProcessStatus.SUCCEEDED msg = [] for name, info in self.replica_info.items(): if info.status in [ @@ -573,17 +573,18 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: replica_ip = info.ip try: msg = '' - readiness_suffix = f'http://{replica_ip}{self.readiness_suffix}' + # TODO(tian): Support HTTPS in the future. 
+ readiness_path = f'http://{replica_ip}{self.readiness_suffix}' if self.post_data is not None: msg += 'Post' response = requests.post( - readiness_suffix, + readiness_path, json=self.post_data, timeout=constants.READINESS_PROBE_TIMEOUT) else: msg += 'Get' response = requests.get( - readiness_suffix, + readiness_path, timeout=constants.READINESS_PROBE_TIMEOUT) msg += (f' request to {replica_ip} returned status code ' f'{response.status_code}') @@ -619,10 +620,10 @@ def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: logger.info(f'Replicas to probe: {replica_to_probe}') for future in futures.as_completed(probe_futures): - cluster_name, res = future.result() + cluster_name, probe_succeeded = future.result() info = self.replica_info[cluster_name] - info.status_property.service_ready_now = res - if res: + info.status_property.service_ready_now = probe_succeeded + if probe_succeeded: info.consecutive_failure_times.clear() if not info.status_property.service_once_ready: info.status_property.service_once_ready = True diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index cb6bb563e61..7a3afe43321 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,7 +1,5 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" import argparse -import os -import signal import threading import time @@ -13,6 +11,7 @@ from sky import sky_logging from sky.serve import constants from sky.serve import load_balancing_policies +from sky.serve import serve_utils # Use the explicit logger name so that the logger is under the # `sky.serve.load_balancer` namespace when executed directly, so as @@ -74,7 +73,7 @@ def _sync_with_controller(self): if response.json().get('is_terminating'): logger.info('Controller is terminating. 
' 'Shutting down load balancer.') - os.kill(os.getpid(), signal.SIGINT) + serve_utils.kill_children_and_self_processes() # send request num in last query interval response = session.post( self.controller_url + '/controller/update_num_requests', diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index ba55650697c..e187b1874a9 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,8 +1,8 @@ """LoadBalancingPolicy: Policy to select endpoint.""" -from collections import deque +import collections import logging import time -from typing import Deque, Optional, Set +from typing import Deque, List, Optional, Set import fastapi @@ -17,7 +17,7 @@ class LoadBalancingPolicy: def __init__(self) -> None: self.ready_replicas: Set[str] = set() self.request_count: int = 0 - self.request_timestamps: Deque[float] = deque() + self.request_timestamps: Deque[float] = collections.deque() self.query_interval: Optional[float] = None def increment_request_count(self, count: int = 1) -> None: @@ -54,18 +54,20 @@ class RoundRobinPolicy(LoadBalancingPolicy): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.replicas_queue: Deque[str] = deque() + self.replicas_queue: List[str] = [] + self.index = 0 def set_ready_replicas(self, ready_replicas: Set[str]) -> None: if set(ready_replicas) != set(self.ready_replicas): self.ready_replicas = ready_replicas - self.replicas_queue = deque(ready_replicas) + self.replicas_queue = list(ready_replicas) + self.index = 0 def select_replica(self, request: fastapi.Request) -> Optional[str]: if not self.replicas_queue: return None - replica_ip = self.replicas_queue.popleft() - self.replicas_queue.append(replica_ip) + replica_ip = self.replicas_queue[self.index] + self.index = (self.index + 1) % len(self.replicas_queue) request_repr = (' None: + subprocess_utils.kill_children_processes() + os.kill(os.getpid(), signal.SIGKILL) + + def 
get_existing_controller_names() -> Set[str]: """Get existing sky serve controller names. @@ -276,6 +274,9 @@ def get_available_controller_name( and have available slots for services. If multiple controllers are available, choose the one with most number of services to decrease the number of controllers. + This function needs to be called within a lock, to avoid concurrency issue + from `existing_controllers` being staled, also, to avoid multiple + `sky serve up` select the same last slot on a controller. Args: controller_resources: The resources requested for controller. @@ -296,15 +297,10 @@ def get_available_controller_name( available_controller_to_service_num[controller_name] = ( services_num_on_controller) if not available_controller_to_service_num: - new_controller_name = generate_controller_cluster_name( - existing_controllers) - # This check should always be true since we already checked the - # service name is valid in `sky.serve_up`. - clouds.Cloud.check_cluster_name_is_valid(new_controller_name) - return new_controller_name, True + return generate_controller_cluster_name(existing_controllers), True # If multiple controllers are available, choose the one with most number of # services. 
- return max(available_controller_to_service_num, + return max(available_controller_to_service_num.keys(), key=lambda k: available_controller_to_service_num[k]), False @@ -645,38 +641,6 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool, return '' -def wait_until_controller_and_load_balancer_terminate( - service_name: str) -> None: - - def wait_until_terminate_info_appear_in_log_file(log_file: str) -> bool: - all_terminate_information = copy.copy(_FASTAPI_APP_TERMINATE_MSGS) - start_time = time.time() - with open(os.path.expanduser(log_file), 'r', newline='') as f: - for line in _follow_logs(f, exit_if_stream_end=False): - for info in all_terminate_information: - if info in line: - all_terminate_information.remove(info) - if not all_terminate_information: - return True - if (time.time() - start_time > - constants.SERVE_TERMINATE_WAIT_TIMEOUT): - break - return False - - # Wait the load balancer to terminate first since it is the first one - # to terminate and the controller will wait for it to terminate. 
- load_balancer_log = generate_remote_load_balancer_log_file_name( - service_name) - if not wait_until_terminate_info_appear_in_log_file(load_balancer_log): - raise ValueError( - f'{colorama.Fore.RED}Failed to wait for load balancer to ' - f'terminate.{colorama.Style.RESET_ALL}') - controller_log = generate_remote_controller_log_file_name(service_name) - if not wait_until_terminate_info_appear_in_log_file(controller_log): - raise ValueError(f'{colorama.Fore.RED}Failed to wait for controller to ' - f'terminate.{colorama.Style.RESET_ALL}') - - def cleanup_service_utility_files(service_name: str) -> None: """Cleanup utility files for a service.""" dir_name = generate_remote_service_dir_name(service_name) @@ -744,11 +708,10 @@ def stream_serve_process_logs(cls, service_name: str, ] return cls._build(code) + # TODO(tian): Move this into termination of controller @classmethod def cleanup_service(cls, service_name: str) -> str: code = [ - 'serve_utils.wait_until_controller_and_load_balancer_terminate(' - f'{service_name!r})', f'serve_utils.cleanup_service_utility_files({service_name!r})', f'serve_state.remove_service({service_name!r})', ] From ec3877a95d7feadee72f91e24503aa48d04a6758 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Oct 2023 13:36:53 -0700 Subject: [PATCH 109/223] move statuses to serve_state, minors --- sky/__init__.py | 4 +- sky/backends/backend_utils.py | 4 +- sky/cli.py | 8 +- sky/serve/__init__.py | 2 + sky/serve/controller.py | 5 +- sky/serve/infra_providers.py | 45 +++++------ sky/serve/serve_state.py | 121 ++++++++++++++++++++++++++-- sky/serve/serve_utils.py | 32 +++----- sky/serve/service_spec.py | 1 - sky/skylet/events.py | 6 +- sky/skylet/skylet.py | 5 +- sky/status_lib.py | 95 ---------------------- sky/utils/cli_utils/status_utils.py | 12 ++- 13 files changed, 174 insertions(+), 166 deletions(-) diff --git a/sky/__init__.py b/sky/__init__.py index e5902edf450..317c585bea5 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -39,11 +39,11 
@@ from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget from sky.resources import Resources +from sky.serve import ReplicaStatus from sky.serve import ServiceComponent +from sky.serve import ServiceStatus from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus -from sky.status_lib import ReplicaStatus -from sky.status_lib import ServiceStatus from sky.task import Task # Aliases. diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 03a606d1513..f373889262e 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2715,7 +2715,7 @@ def _add_default_value_to_local_record( # NOTE(dev): Keep this align with sky.serve.controller.get_latest_info if record is None: return record - record['status'] = status_lib.ServiceStatus.UNKNOWN + record['status'] = serve_lib.ServiceStatus.UNKNOWN record['uptime'] = None record['replica_info'] = [] return record @@ -2745,7 +2745,7 @@ def _refresh_service_record_no_lock( service_handle: serve_lib.ServiceHandle = record['handle'] if not service_handle.endpoint_ip: # Service controller is still initializing. Skipped refresh status. 
- record['status'] = status_lib.ServiceStatus.CONTROLLER_INIT + record['status'] = serve_lib.ServiceStatus.CONTROLLER_INIT return record, None controller_name = record['controller_name'] diff --git a/sky/cli.py b/sky/cli.py index 94ce7685771..ea5a419b47e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,6 +51,7 @@ from sky import core from sky import exceptions from sky import global_user_state +from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib from sky import status_lib @@ -77,7 +78,6 @@ from sky.utils.cli_utils import status_utils if typing.TYPE_CHECKING: - from sky import serve as serve_lib from sky.backends import backend as backend_lib logger = sky_logging.init_logger(__name__) @@ -4271,14 +4271,14 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: - handle: 'serve_lib.ServiceHandle' = service_record['handle'] + handle: serve_lib.ServiceHandle = service_record['handle'] for replica_record in service_record['replica_info']: # Only print FAILED replicas if: # 1. --all is specified; # 2. auto_restart is not enabled (in which FAILED replica count # as one replica). 
- if (all or not handle.auto_restart or replica_record['status'] != - status_lib.ReplicaStatus.FAILED): + if (all or not handle.auto_restart or + replica_record['status'] != serve_lib.ReplicaStatus.FAILED): replica_record['service_name'] = service_record['name'] replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 1be687d78e0..a5d6ab93c19 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -10,6 +10,8 @@ from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND +from sky.serve.serve_state import ReplicaStatus +from sky.serve.serve_state import ServiceStatus from sky.serve.serve_utils import gen_ports_for_serve_process from sky.serve.serve_utils import generate_controller_yaml_file_name from sky.serve.serve_utils import generate_remote_controller_log_file_name diff --git a/sky/serve/controller.py b/sky/serve/controller.py index a4921011303..26423b41b4b 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -17,7 +17,6 @@ from sky import authentication from sky import serve from sky import sky_logging -from sky import status_lib from sky.serve import autoscalers from sky.serve import infra_providers from sky.serve import serve_state @@ -110,7 +109,7 @@ def get_latest_info(): self.infra_provider.get_replica_info(verbose=True), 'uptime': record.get('uptime', None), 'status': record.get('status', - status_lib.ServiceStatus.UNKNOWN), + serve_state.ServiceStatus.UNKNOWN), } latest_info = { k: base64.b64encode(pickle.dumps(v)).decode('utf-8') @@ -123,7 +122,7 @@ def terminate(request: fastapi.Request): del request logger.info('Terminating service...') serve_state.set_status(self.infra_provider.service_name, - status_lib.ServiceStatus.SHUTTING_DOWN) + serve_state.ServiceStatus.SHUTTING_DOWN) if self.autoscaler is not None: logger.info('Terminate 
autoscaler...') self.autoscaler.terminate() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 92ab5ac7d6b..a9f0660be92 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -15,7 +15,6 @@ from sky import backends from sky import global_user_state -from sky import status_lib from sky.serve import constants from sky.serve import serve_state from sky.serve import serve_utils @@ -96,52 +95,52 @@ def should_track_status(self) -> bool: return False return True - def to_replica_status(self) -> status_lib.ReplicaStatus: + def to_replica_status(self) -> serve_state.ReplicaStatus: if self.sky_launch_status == ProcessStatus.RUNNING: # Still launching - return status_lib.ReplicaStatus.PROVISIONING + return serve_state.ReplicaStatus.PROVISIONING if self.sky_down_status is not None: if self.sky_down_status == ProcessStatus.RUNNING: # sky.down is running - return status_lib.ReplicaStatus.SHUTTING_DOWN + return serve_state.ReplicaStatus.SHUTTING_DOWN if self.sky_down_status == ProcessStatus.FAILED: # sky.down failed - return status_lib.ReplicaStatus.FAILED_CLEANUP + return serve_state.ReplicaStatus.FAILED_CLEANUP if self.user_app_failed: # Failed on user setup/run - return status_lib.ReplicaStatus.FAILED + return serve_state.ReplicaStatus.FAILED if not self.service_once_ready: # initial delay seconds exceeded - return status_lib.ReplicaStatus.FAILED + return serve_state.ReplicaStatus.FAILED if not self.service_ready_now: # Max continuous failure exceeded - return status_lib.ReplicaStatus.FAILED + return serve_state.ReplicaStatus.FAILED if self.sky_launch_status == ProcessStatus.FAILED: # sky.launch failed - return status_lib.ReplicaStatus.FAILED + return serve_state.ReplicaStatus.FAILED # This indicate it is a scale_down with correct teardown. # Should have been cleaned from the replica_info. 
- return status_lib.ReplicaStatus.UNKNOWN + return serve_state.ReplicaStatus.UNKNOWN if self.sky_launch_status == ProcessStatus.FAILED: # sky.launch failed # Down process should have been started. # If not started, this means some bug prevent sky.down from # executing. It is also a potential resource leak, so we mark # it as FAILED_CLEANUP. - return status_lib.ReplicaStatus.FAILED_CLEANUP + return serve_state.ReplicaStatus.FAILED_CLEANUP if self.service_ready_now: # Service is ready - return status_lib.ReplicaStatus.READY + return serve_state.ReplicaStatus.READY if self.user_app_failed: # Failed on user setup/run # Same as above - return status_lib.ReplicaStatus.FAILED_CLEANUP + return serve_state.ReplicaStatus.FAILED_CLEANUP if self.service_once_ready: # Service was ready before but not now - return status_lib.ReplicaStatus.NOT_READY + return serve_state.ReplicaStatus.NOT_READY else: # No readiness probe passed and sky.launch finished - return status_lib.ReplicaStatus.STARTING + return serve_state.ReplicaStatus.STARTING class ReplicaInfo: @@ -172,9 +171,9 @@ def ip(self) -> Optional[str]: return handle.head_ip @property - def status(self) -> status_lib.ReplicaStatus: + def status(self) -> serve_state.ReplicaStatus: replica_status = self.status_property.to_replica_status() - if replica_status == status_lib.ReplicaStatus.UNKNOWN: + if replica_status == serve_state.ReplicaStatus.UNKNOWN: logger.error('Detecting UNKNOWN replica status for cluster ' f'{self.cluster_name}') return replica_status @@ -390,13 +389,13 @@ def total_replica_num(self, count_failed_replica: bool) -> int: return len(self.replica_info) return len([ i for i in self.replica_info.values() - if i.status != status_lib.ReplicaStatus.FAILED + if i.status != serve_state.ReplicaStatus.FAILED ]) def get_ready_replicas(self) -> Set[str]: ready_replicas = set() for info in self.replica_info.values(): - if info.status == status_lib.ReplicaStatus.READY: + if info.status == serve_state.ReplicaStatus.READY: 
assert info.ip is not None ready_replicas.add(info.ip) return ready_replicas @@ -520,16 +519,16 @@ def terminate(self) -> Optional[str]: msg = [] for name, info in self.replica_info.items(): if info.status in [ - status_lib.ReplicaStatus.FAILED_CLEANUP, - status_lib.ReplicaStatus.UNKNOWN, + serve_state.ReplicaStatus.FAILED_CLEANUP, + serve_state.ReplicaStatus.UNKNOWN, ]: msg.append(f'Cluster with status {info.status} found. Please ' 'manually check the cloud console to make sure no ' 'resource leak.') # Skip those already deleted and those are deleting if info.status not in [ - status_lib.ReplicaStatus.FAILED, - status_lib.ReplicaStatus.SHUTTING_DOWN + serve_state.ReplicaStatus.FAILED, + serve_state.ReplicaStatus.SHUTTING_DOWN ]: self._teardown_cluster(name, sync_down_logs=False) for name, p in self.down_process_pool.items(): diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index ae139420c13..3b5a80ae73b 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -1,9 +1,12 @@ """The database for services information.""" +import collections +import enum import pathlib import sqlite3 from typing import Any, Dict, List, Optional -from sky import status_lib +import colorama + from sky.serve import constants from sky.utils import db_utils @@ -27,6 +30,113 @@ _CONN.commit() +class ReplicaStatus(enum.Enum): + """Replica status.""" + + # The replica VM is being provisioned. i.e., the `sky.launch` is still + # running. + PROVISIONING = 'PROVISIONING' + + # The replica VM is provisioned and the service is starting. This indicates + # user's `setup` section or `run` section is still running, and the + # readiness probe fails. + STARTING = 'STARTING' + + # The replica VM is provisioned and the service is ready, i.e. the + # readiness probe is passed. + READY = 'READY' + + # The service was ready before, but it becomes not ready now, i.e. the + # readiness probe fails. + NOT_READY = 'NOT_READY' + + # The replica VM is being shut down. 
i.e., the `sky down` is still running. + SHUTTING_DOWN = 'SHUTTING_DOWN' + + # The replica VM is once failed and has been deleted. + FAILED = 'FAILED' + + # `sky.down` failed during service teardown. This could mean resource + # leakage. + FAILED_CLEANUP = 'FAILED_CLEANUP' + + # Unknown status. This should never happen. + UNKNOWN = 'UNKNOWN' + + @classmethod + def failed_statuses(cls): + return [cls.FAILED, cls.FAILED_CLEANUP, cls.UNKNOWN] + + def colored_str(self): + color = _REPLICA_STATUS_TO_COLOR[self] + return f'{color}{self.value}{colorama.Style.RESET_ALL}' + + +_REPLICA_STATUS_TO_COLOR = { + ReplicaStatus.PROVISIONING: colorama.Fore.BLUE, + ReplicaStatus.STARTING: colorama.Fore.CYAN, + ReplicaStatus.READY: colorama.Fore.GREEN, + ReplicaStatus.NOT_READY: colorama.Fore.YELLOW, + ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, + ReplicaStatus.SHUTTING_DOWN: colorama.Fore.MAGENTA, + ReplicaStatus.FAILED: colorama.Fore.RED, + ReplicaStatus.UNKNOWN: colorama.Fore.RED, +} + + +class ServiceStatus(enum.Enum): + """Service status as recorded in table 'services'.""" + + # Controller is initializing + CONTROLLER_INIT = 'CONTROLLER_INIT' + + # Replica is initializing and no failure + REPLICA_INIT = 'REPLICA_INIT' + + # Controller failed to initialize / controller or load balancer process + # status abnormal + CONTROLLER_FAILED = 'CONTROLLER_FAILED' + + # At least one replica is ready + READY = 'READY' + + # Service is being shutting down + SHUTTING_DOWN = 'SHUTTING_DOWN' + + # Cannot connect to controller + UNKNOWN = 'UNKNOWN' + + # At least one replica is failed and no replica is ready + FAILED = 'FAILED' + + def colored_str(self): + color = _SERVICE_STATUS_TO_COLOR[self] + return f'{color}{self.value}{colorama.Style.RESET_ALL}' + + @classmethod + def from_replica_info( + cls, replica_info: List[Dict[str, Any]]) -> 'ServiceStatus': + status2num = collections.Counter([i['status'] for i in replica_info]) + # If one replica is READY, the service is READY. 
+ if status2num[ReplicaStatus.READY] > 0: + return cls.READY + if sum(status2num[status] + for status in ReplicaStatus.failed_statuses()) > 0: + return cls.FAILED + return cls.REPLICA_INIT + + +_SERVICE_STATUS_TO_COLOR = { + ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE, + ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, + ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, + ServiceStatus.READY: colorama.Fore.GREEN, + ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, + ServiceStatus.UNKNOWN: colorama.Fore.YELLOW, + ServiceStatus.FAILED: colorama.Fore.RED, +} + + def add_service(job_id: int, service_name: str, controller_port: int) -> None: """Adds a service to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: @@ -34,9 +144,8 @@ def add_service(job_id: int, service_name: str, controller_port: int) -> None: """\ INSERT INTO services (name, controller_job_id, controller_port, status) - VALUES (?, ?, ?, ?)""", - (service_name, job_id, controller_port, - status_lib.ServiceStatus.CONTROLLER_INIT.value)) + VALUES (?, ?, ?, ?)""", (service_name, job_id, controller_port, + ServiceStatus.CONTROLLER_INIT.value)) def remove_service(service_name: str) -> None: @@ -55,7 +164,7 @@ def set_uptime(service_name: str, uptime: int) -> None: uptime=(?) 
WHERE name=(?)""", (uptime, service_name)) -def set_status(service_name: str, status: status_lib.ServiceStatus) -> None: +def set_status(service_name: str, status: ServiceStatus) -> None: """Sets the service status.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( @@ -70,7 +179,7 @@ def _get_service_from_row(row) -> Dict[str, Any]: 'name': name, 'controller_job_id': controller_job_id, 'controller_port': controller_port, - 'status': status_lib.ServiceStatus[status], + 'status': ServiceStatus[status], 'uptime': uptime, } diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 5fff5d35920..e6824780527 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -1,6 +1,5 @@ """User interface with the SkyServe.""" import base64 -import collections import enum import os import pickle @@ -310,32 +309,27 @@ def set_service_status_from_replica_info( if record is None: raise ValueError(f'Service {service_name!r} does not exist. ' 'Cannot refresh service status.') - if record['status'] == status_lib.ServiceStatus.SHUTTING_DOWN: + if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN: # When the service is shutting down, there is a period of time which the # controller still responds to the request, and the replica is not # terminated, the service status will still be READY, but we don't want # change service status to READY. return - status2num = collections.Counter([i['status'] for i in replica_info]) - # If one replica is READY, the service is READY. 
- if status2num[status_lib.ReplicaStatus.READY] > 0: - status = status_lib.ServiceStatus.READY - elif sum(status2num[status] - for status in status_lib.ReplicaStatus.failed_statuses()) > 0: - status = status_lib.ServiceStatus.FAILED - else: - status = status_lib.ServiceStatus.REPLICA_INIT - serve_state.set_status(service_name, status) + serve_state.set_status( + service_name, serve_state.ServiceStatus.from_replica_info(replica_info)) -def monitor_service_controller_job_status() -> None: +def update_service_status() -> None: services = serve_state.get_services() for record in services: + if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN: + # Skip services that is shutting down. + continue controller_status = job_lib.get_status(record['controller_job_id']) if controller_status is None or controller_status.is_terminal(): # If controller job is not running, set it as controller failed. serve_state.set_status(record['name'], - status_lib.ServiceStatus.CONTROLLER_FAILED) + serve_state.ServiceStatus.CONTROLLER_FAILED) class ServiceHandle(object): @@ -457,10 +451,10 @@ def check_service_status_healthy(service_name: str) -> Optional[str]: service_record = serve_state.get_service_from_name(service_name) if service_record is None: return f'Service {service_name!r} does not exist.' - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_INIT: + if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT: return (f'Service {service_name!r} is still initializing its ' 'controller. Please try again later.') - if service_record['status'] == status_lib.ServiceStatus.CONTROLLER_FAILED: + if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED: return (f'Service {service_name!r}\'s controller failed. ' 'Cannot tail logs.') return None @@ -565,7 +559,7 @@ def stream_replica_logs(service_name: str, return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.' 
f'{colorama.Style.RESET_ALL}') - def _get_replica_status() -> status_lib.ReplicaStatus: + def _get_replica_status() -> serve_state.ReplicaStatus: resp = requests.get( _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + '/controller/get_latest_info') @@ -586,7 +580,7 @@ def _get_replica_status() -> status_lib.ReplicaStatus: return target_info['status'] finish_stream = ( - lambda: _get_replica_status() != status_lib.ReplicaStatus.PROVISIONING) + lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING) with open(launch_log_file_name, 'r', newline='') as f: for line in _follow_replica_logs(f, replica_cluster_name, @@ -594,7 +588,7 @@ def _get_replica_status() -> status_lib.ReplicaStatus: exit_if_stream_end=not follow): print(line, end='', flush=True) if not follow and _get_replica_status( - ) == status_lib.ReplicaStatus.PROVISIONING: + ) == serve_state.ReplicaStatus.PROVISIONING: # Early exit if not following the logs. return '' diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index db9847cc6ba..03d94117c71 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -194,7 +194,6 @@ def __repr__(self) -> str: Readiness initial delay seconds: {self.initial_delay_seconds} Replica autoscaling policy: {self.policy_str()} Replica auto restart: {self.auto_restart} - Please refer to SkyServe document for detailed explanations. 
""") @property diff --git a/sky/skylet/events.py b/sky/skylet/events.py index 68c5c3af4ab..21e29038a22 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -71,12 +71,12 @@ def _run(self): spot_utils.update_spot_job_status() -class ServiceJobStatusMonitorEvent(SkyletEvent): - """Skylet event for monitoring sky serve controller jobs.""" +class ServiceUpdateEvent(SkyletEvent): + """Skylet event for updating sky serve service status.""" EVENT_INTERVAL_SECONDS = 300 def _run(self): - serve_utils.monitor_service_controller_job_status() + serve_utils.update_service_status() class AutostopEvent(SkyletEvent): diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index 3fc93913fea..e10f91bf24b 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -18,7 +18,10 @@ # Otherwise, the abnormal spot job status update will be delayed # until the next job update event. events.SpotJobUpdateEvent(), - events.ServiceJobStatusMonitorEvent(), + # This is for monitoring controller job status. If it becomes + # unhealthy, this event will correctly update the controller + # status to CONTROLLER_FAILED. 
+ events.ServiceUpdateEvent(), ] while True: diff --git a/sky/status_lib.py b/sky/status_lib.py index 82ceaf391b7..ae9a00c84de 100644 --- a/sky/status_lib.py +++ b/sky/status_lib.py @@ -49,98 +49,3 @@ class StorageStatus(enum.Enum): # Finished uploading, in terminal state READY = 'READY' - - -class ServiceStatus(enum.Enum): - """Service status as recorded in table 'services'.""" - - # Controller is initializing - CONTROLLER_INIT = 'CONTROLLER_INIT' - - # Replica is initializing and no failure - REPLICA_INIT = 'REPLICA_INIT' - - # Controller failed to initialize / controller or load balancer process - # status abnormal - CONTROLLER_FAILED = 'CONTROLLER_FAILED' - - # At least one replica is ready - READY = 'READY' - - # Service is being shutting down - SHUTTING_DOWN = 'SHUTTING_DOWN' - - # Cannot connect to controller - UNKNOWN = 'UNKNOWN' - - # At least one replica is failed and no replica is ready - FAILED = 'FAILED' - - def colored_str(self): - color = _SERVICE_STATUS_TO_COLOR[self] - return f'{color}{self.value}{colorama.Style.RESET_ALL}' - - -_SERVICE_STATUS_TO_COLOR = { - ServiceStatus.CONTROLLER_INIT: colorama.Fore.BLUE, - ServiceStatus.REPLICA_INIT: colorama.Fore.BLUE, - ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, - ServiceStatus.READY: colorama.Fore.GREEN, - ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, - ServiceStatus.UNKNOWN: colorama.Fore.YELLOW, - ServiceStatus.FAILED: colorama.Fore.RED, -} - - -class ReplicaStatus(enum.Enum): - """Replica status.""" - - # The replica VM is being provisioned. i.e., the `sky.launch` is still - # running. - PROVISIONING = 'PROVISIONING' - - # The replica VM is provisioned and the service is starting. This indicates - # user's `setup` section or `run` section is still running, and the - # readiness probe fails. - STARTING = 'STARTING' - - # The replica VM is provisioned and the service is ready, i.e. the - # readiness probe is passed. 
- READY = 'READY' - - # The service was ready before, but it becomes not ready now, i.e. the - # readiness probe fails. - NOT_READY = 'NOT_READY' - - # The replica VM is being shut down. i.e., the `sky down` is still running. - SHUTTING_DOWN = 'SHUTTING_DOWN' - - # The replica VM is once failed and has been deleted. - FAILED = 'FAILED' - - # `sky.down` failed during service teardown. This could mean resource - # leakage. - FAILED_CLEANUP = 'FAILED_CLEANUP' - - # Unknown status. This should never happen. - UNKNOWN = 'UNKNOWN' - - @classmethod - def failed_statuses(cls): - return [cls.FAILED, cls.FAILED_CLEANUP, cls.UNKNOWN] - - def colored_str(self): - color = _REPLICA_STATUS_TO_COLOR[self] - return f'{color}{self.value}{colorama.Style.RESET_ALL}' - - -_REPLICA_STATUS_TO_COLOR = { - ReplicaStatus.PROVISIONING: colorama.Fore.BLUE, - ReplicaStatus.STARTING: colorama.Fore.CYAN, - ReplicaStatus.READY: colorama.Fore.GREEN, - ReplicaStatus.NOT_READY: colorama.Fore.YELLOW, - ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, - ReplicaStatus.SHUTTING_DOWN: colorama.Fore.MAGENTA, - ReplicaStatus.FAILED: colorama.Fore.RED, - ReplicaStatus.UNKNOWN: colorama.Fore.RED, -} diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index af376f090cf..ef7d21b006b 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -7,6 +7,7 @@ import colorama from sky import backends +from sky import serve from sky import spot from sky import status_lib from sky.backends import backend_utils @@ -15,7 +16,6 @@ if typing.TYPE_CHECKING: import sky - from sky import serve COMMAND_TRUNC_LENGTH = 25 REPLICA_TRUNC_NUM = 10 @@ -393,8 +393,7 @@ def show_local_status_table(local_clusters: List[str]): _get_service_name = (lambda replica_record: replica_record['service_name']) -def _get_service_handle( - service_record: _ServiceRecord) -> 'serve.ServiceHandle': +def _get_service_handle(service_record: _ServiceRecord) -> serve.ServiceHandle: 
return service_record['handle'] @@ -423,11 +422,11 @@ def _get_replicas(service_record: _ServiceRecord) -> str: ready_replica_num, total_replica_num = 0, 0 auto_restart = _get_service_handle(service_record).auto_restart for info in service_record['replica_info']: - if _get_status(info) == status_lib.ReplicaStatus.READY: + if _get_status(info) == serve.ReplicaStatus.READY: ready_replica_num += 1 # If auto restart enabled, not count FAILED replicas here. if (not auto_restart or - _get_status(info) != status_lib.ReplicaStatus.FAILED): + _get_status(info) != serve.ReplicaStatus.FAILED): total_replica_num += 1 return f'{ready_replica_num}/{total_replica_num}' @@ -448,8 +447,7 @@ def _get_display_endpoint(service_record: _ServiceRecord) -> str: return endpoint -def _get_service_status( - service_record: _ServiceRecord) -> status_lib.ServiceStatus: +def _get_service_status(service_record: _ServiceRecord) -> serve.ServiceStatus: return service_record['status'] From e97381e2ab9af323df00df789041d54eb84986a0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Oct 2023 14:20:02 -0700 Subject: [PATCH 110/223] minor --- sky/cli.py | 9 +++++---- sky/serve/infra_providers.py | 7 ++++--- sky/task.py | 4 +--- tests/skyserve/replica_failure/server.py | 10 +++++----- tests/skyserve/replica_failure/service.yaml | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ea5a419b47e..aa8d01999ac 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1146,7 +1146,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( f'If you see this, please file an issue; tasks: {dag.tasks}') task = dag.tasks[0] else: - task = sky.Task(name=sky.Task.CLI_CMD_TASK_NAME, run=entrypoint) + task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) # Override. 
@@ -4105,14 +4105,15 @@ def serve_up( with ux_utils.print_exception_no_traceback(): raise RuntimeError(prompt) + is_yaml, _ = _check_yaml(''.join(entrypoint)) + if not is_yaml: + raise click.UsageError( + 'For `sky serve up`, the entrypoint must be a YAML file.') task = _make_task_or_dag_from_entrypoint_with_overrides( entrypoint, entrypoint_name='Service') if isinstance(task, sky.Dag): raise click.UsageError( _DAG_NOT_SUPPORT_MESSAGE.format(command='sky serve up')) - if task.name == sky.Task.CLI_CMD_TASK_NAME: - raise click.UsageError( - 'For `sky serve up`, the entrypoint must be a YAML file.') if task.service is None: with ux_utils.print_exception_no_traceback(): diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index a9f0660be92..b04c404d7c1 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -333,10 +333,11 @@ def _fetch_job_status(self) -> None: backend = backends.CloudVmRayBackend() handle = info.handle assert handle is not None, info - # Only fetch job 1, which stands for user task job - job_statuses = backend.get_job_status(handle, [1], + # Use None to fetch latest job, which stands for user task job + job_statuses = backend.get_job_status(handle, + None, stream_logs=False) - job_status = job_statuses[1] + job_status = list(job_statuses.values())[0] if job_status in [ job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: diff --git a/sky/task.py b/sky/task.py index c67ee5adab1..98cb13ce855 100644 --- a/sky/task.py +++ b/sky/task.py @@ -162,8 +162,6 @@ def _add_docker_login_config(resources: 'resources_lib.Resources'): class Task: """Task: a computation to be run on the cloud.""" - CLI_CMD_TASK_NAME = 'sky-cmd' - def __init__( self, name: Optional[str] = None, @@ -1043,7 +1041,7 @@ def __repr__(self): run_msg = 'run=' name_str = '' - if self.name is not None and self.name != self.CLI_CMD_TASK_NAME: + if self.name is not None: name_str = f'' s = f'Task{name_str}({run_msg})' if self.inputs is not None: 
diff --git a/tests/skyserve/replica_failure/server.py b/tests/skyserve/replica_failure/server.py index f8378d980be..bf4ba747401 100644 --- a/tests/skyserve/replica_failure/server.py +++ b/tests/skyserve/replica_failure/server.py @@ -13,18 +13,18 @@ def get_self_ip() -> str: return requests.get('http://ifconfig.me').text -@app.get("/get_ip") +@app.get('/get_ip') async def get_ip(): return {'ip': get_self_ip()} -@app.get("/health") +@app.get('/health') async def health(): - return {"status": "ok"} + return {'status': 'ok'} -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser(description='SkyServe HTTP Test Server') parser.add_argument('--port', type=int, required=True) args = parser.parse_args() - uvicorn.run(app, host="0.0.0.0", port=args.port) + uvicorn.run(app, host='0.0.0.0', port=args.port) diff --git a/tests/skyserve/replica_failure/service.yaml b/tests/skyserve/replica_failure/service.yaml index 1f12e139ed1..096dc8cdf34 100644 --- a/tests/skyserve/replica_failure/service.yaml +++ b/tests/skyserve/replica_failure/service.yaml @@ -3,7 +3,7 @@ resources: zone: us-central1-a cpus: 2+ -workdir: tests/skyserve/interrupt +workdir: tests/skyserve/replica_failure setup: pip install fastapi[all] uvicorn From e55958db224c9484159b0544d708b0d12b7fb0b4 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Oct 2023 15:24:32 -0700 Subject: [PATCH 111/223] speedup terminate --- sky/serve/controller.py | 19 +++++-------------- sky/serve/infra_providers.py | 31 ++++++++++++++++++++----------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 26423b41b4b..dc3964283dd 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -9,7 +9,6 @@ import pickle import threading import time -from typing import Optional import fastapi import uvicorn @@ -44,10 +43,8 @@ class SkyServeController: - Providing the HTTP Server API for SkyServe to communicate with. 
""" - def __init__(self, - port: int, - infra_provider: infra_providers.InfraProvider, - autoscaler: Optional[autoscalers.Autoscaler] = None) -> None: + def __init__(self, port: int, infra_provider: infra_providers.InfraProvider, + autoscaler: autoscalers.Autoscaler) -> None: self.port = port self.infra_provider = infra_provider self.autoscaler = autoscaler @@ -123,9 +120,8 @@ def terminate(request: fastapi.Request): logger.info('Terminating service...') serve_state.set_status(self.infra_provider.service_name, serve_state.ServiceStatus.SHUTTING_DOWN) - if self.autoscaler is not None: - logger.info('Terminate autoscaler...') - self.autoscaler.terminate() + logger.info('Terminate autoscaler...') + self.autoscaler.terminate() msg = self.infra_provider.terminate() if msg is None: # We cannot terminate the controller now because we still @@ -133,12 +129,7 @@ def terminate(request: fastapi.Request): self.terminating = True return {'message': msg} - # Run replica_prober and autoscaler (if autoscaler is defined) - # in separate threads in the background. - # This should not block the main thread. 
- self.infra_provider.start_replica_prober() - if self.autoscaler is not None: - self.autoscaler.start() + self.autoscaler.start() # Start a daemon to check if the controller is terminating, and if so, # shutdown the controller so the skypilot jobs will finish, thus enable diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index b04c404d7c1..b393ca0a38c 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -233,10 +233,6 @@ def terminate(self) -> Optional[str]: # Terminate service raise NotImplementedError - def start_replica_prober(self) -> None: - # Start the replica fetcher thread - raise NotImplementedError - class SkyPilotInfraProvider(InfraProvider): """Infra provider for SkyPilot clusters.""" @@ -252,6 +248,7 @@ def __init__(self, task_yaml_path: str, *args, **kwargs) -> None: self._start_process_pool_refresher() self._start_job_status_fetcher() + self._start_replica_prober() # This process periodically checks all sky.launch and sky.down process # on the fly. If any of them finished, it will update the status of @@ -308,7 +305,7 @@ def _refresh_process_pool(self) -> None: # TODO(tian): Maybe use decorator? def _process_pool_refresher(self) -> None: - while not self.process_pool_refresher_stop_event.is_set(): + while True: logger.info('Refreshing process pool.') try: self._refresh_process_pool() @@ -316,7 +313,11 @@ def _process_pool_refresher(self) -> None: # No matter what error happens, we should keep the # process pool refresher running. 
logger.error(f'Error in process pool refresher: {e}') - time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) + for _ in range(_PROCESS_POOL_REFRESH_INTERVAL): + if self.process_pool_refresher_stop_event.is_set(): + logger.info('Process pool refresher terminated.') + return + time.sleep(1) def _start_process_pool_refresher(self) -> None: self.process_pool_refresher_stop_event = threading.Event() @@ -355,7 +356,7 @@ def _fetch_job_status(self) -> None: self._teardown_cluster(cluster_name, sync_down_logs=True) def _job_status_fetcher(self) -> None: - while not self.job_status_fetcher_stop_event.is_set(): + while True: logger.info('Refreshing job status.') try: self._fetch_job_status() @@ -363,7 +364,11 @@ def _job_status_fetcher(self) -> None: # No matter what error happens, we should keep the # job status fetcher running. logger.error(f'Error in job status fetcher: {e}') - time.sleep(_JOB_STATUS_FETCH_INTERVAL) + for _ in range(_JOB_STATUS_FETCH_INTERVAL): + if self.job_status_fetcher_stop_event.is_set(): + logger.info('Job status fetcher terminated.') + return + time.sleep(1) def _start_job_status_fetcher(self) -> None: self.job_status_fetcher_stop_event = threading.Event() @@ -546,7 +551,7 @@ def terminate(self) -> Optional[str]: return '\n'.join(msg) def _replica_prober(self) -> None: - while not self.replica_prober_stop_event.is_set(): + while True: logger.info('Running replica prober.') try: self._probe_all_replicas() @@ -556,9 +561,13 @@ def _replica_prober(self) -> None: # No matter what error happens, we should keep the # replica prober running. 
logger.error(f'Error in replica prober: {e}') - time.sleep(_ENDPOINT_PROBE_INTERVAL) + for _ in range(_ENDPOINT_PROBE_INTERVAL): + if self.replica_prober_stop_event.is_set(): + logger.info('Replica prober terminated.') + return + time.sleep(1) - def start_replica_prober(self) -> None: + def _start_replica_prober(self) -> None: self.replica_prober_stop_event = threading.Event() self.replica_prober_thread = threading.Thread( target=self._replica_prober) From a952249421ea136cb3261152d343a216776bf79a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 00:29:33 -0700 Subject: [PATCH 112/223] replica db mypy & pylint passed --- sky/backends/backend_utils.py | 43 ++++ sky/serve/controller.py | 5 +- sky/serve/infra_providers.py | 400 +++++++++++++++++++--------------- sky/serve/serve_state.py | 65 +++++- sky/serve/serve_utils.py | 6 +- sky/spot/controller.py | 38 +--- 6 files changed, 342 insertions(+), 215 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index f373889262e..3349e08ff55 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2838,6 +2838,49 @@ def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: return [record for record in updated_records if record is not None] +# Internal only: +def download_and_stream_latest_job_log( + backend: 'cloud_vm_ray_backend.CloudVmRayBackend', + handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle', local_dir: str, + log_position_hint: str, log_finish_hint: str) -> None: + """Downloads and streams the latest job log. + + This function is only used by spot controller and sky serve controller. + """ + os.makedirs(local_dir, exist_ok=True) + try: + log_dirs = backend.sync_down_logs( + handle, + # Download the log of the latest job. 
+ # The job_id for the spot job running on the spot cluster is not + # necessarily 1, as it is possible that the worker node in a + # multi-node cluster is preempted, and we recover the spot job + # on the existing cluster, which leads to a larger job_id. Those + # job_ids all represent the same logical spot job. + job_ids=None, + local_dir=local_dir) + except exceptions.CommandError as e: + logger.info(f'Failed to download the logs: ' + f'{common_utils.format_exception(e)}') + else: + if not log_dirs: + logger.error('Failed to find the logs for the user program in ' + f'the {log_position_hint}.') + else: + log_dir = list(log_dirs.values())[0] + log_file = os.path.join(log_dir, 'run.log') + + # Print the logs to the console. + try: + with open(log_file) as f: + print(f.read()) + except FileNotFoundError: + logger.error('Failed to find the logs for the user ' + f'program at {log_file}.') + else: + logger.info(f'\n== End of logs ({log_finish_hint}) ==') + + @typing.overload def get_backend_from_handle( handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index dc3964283dd..430edf99f71 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -118,8 +118,9 @@ def get_latest_info(): def terminate(request: fastapi.Request): del request logger.info('Terminating service...') - serve_state.set_status(self.infra_provider.service_name, - serve_state.ServiceStatus.SHUTTING_DOWN) + serve_state.set_service_status( + self.infra_provider.service_name, + serve_state.ServiceStatus.SHUTTING_DOWN) logger.info('Terminate autoscaler...') self.autoscaler.terminate() msg = self.infra_provider.terminate() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index b393ca0a38c..4e8269f61b0 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -2,8 +2,9 @@ from concurrent import futures import dataclasses import enum +import functools import logging -import random 
+import os import signal import subprocess import threading @@ -15,9 +16,11 @@ from sky import backends from sky import global_user_state -from sky.serve import constants +from sky.backends import backend_utils +from sky.serve import constants as serve_constants from sky.serve import serve_state from sky.serve import serve_utils +from sky.skylet import constants from sky.skylet import job_lib from sky.utils import env_options @@ -43,6 +46,16 @@ def _interrupt_process_and_children(pid: int) -> None: pass +def with_lock(func): + + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + with self.lock: + return func(self, *args, **kwargs) + + return wrapper + + class ProcessStatus(enum.Enum): """Process status.""" @@ -119,7 +132,7 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: # sky.launch failed return serve_state.ReplicaStatus.FAILED # This indicate it is a scale_down with correct teardown. - # Should have been cleaned from the replica_info. + # Should have been cleaned from the replica table. 
return serve_state.ReplicaStatus.UNKNOWN if self.sky_launch_status == ProcessStatus.FAILED: # sky.launch failed @@ -143,6 +156,7 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: return serve_state.ReplicaStatus.STARTING +# TODO(tian): Maybe rename it to Replica class ReplicaInfo: """Replica info for each replica.""" @@ -150,7 +164,7 @@ def __init__(self, replica_id: int, cluster_name: str) -> None: self.replica_id: int = replica_id self.cluster_name: str = cluster_name self.first_not_ready_time: Optional[float] = None - self.consecutive_failure_times: List[int] = [] + self.consecutive_failure_times: List[float] = [] self.status_property: ReplicaStatusProperty = ReplicaStatusProperty() @property @@ -188,6 +202,42 @@ def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: info_dict['handle'] = self.handle return info_dict + def probe( + self, readiness_suffix: str, post_data: Optional[Union[str, Dict[str, + Any]]] + ) -> Tuple['ReplicaInfo', bool]: + replica_ip = self.ip + try: + msg = '' + # TODO(tian): Support HTTPS in the future. + readiness_path = f'http://{replica_ip}{readiness_suffix}' + if post_data is not None: + msg += 'Post' + response = requests.post( + readiness_path, + json=post_data, + timeout=serve_constants.READINESS_PROBE_TIMEOUT) + else: + msg += 'Get' + response = requests.get( + readiness_path, + timeout=serve_constants.READINESS_PROBE_TIMEOUT) + msg += (f' request to {replica_ip} returned status code ' + f'{response.status_code}') + if response.status_code == 200: + msg += '.' + else: + msg += f' and response {response.text}.' 
+ logger.info(msg) + if response.status_code == 200: + logger.info(f'Replica {replica_ip} is ready.') + return self, True + except requests.exceptions.RequestException as e: + logger.info(e) + logger.info(f'Replica {replica_ip} is not ready.') + pass + return self, False + class InfraProvider: """Each infra provider manages one service.""" @@ -198,13 +248,13 @@ def __init__( readiness_suffix: str, initial_delay_seconds: int, post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: + self.lock = threading.Lock() + self.next_replica_id: int = 1 self.service_name: str = service_name self.readiness_suffix: str = readiness_suffix self.initial_delay_seconds: int = initial_delay_seconds self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data self.uptime: Optional[float] = None - self.replica_info: serve_utils.ThreadSafeDict[ - str, ReplicaInfo] = serve_utils.ThreadSafeDict() logger.info(f'Readiness probe suffix: {self.readiness_suffix}') logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') logger.info(f'Post data: {self.post_data} ({type(self.post_data)})') @@ -240,11 +290,10 @@ class SkyPilotInfraProvider(InfraProvider): def __init__(self, task_yaml_path: str, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.task_yaml_path: str = task_yaml_path - self.next_replica_id: int = 1 self.launch_process_pool: serve_utils.ThreadSafeDict[ - str, subprocess.Popen] = serve_utils.ThreadSafeDict() + int, subprocess.Popen] = serve_utils.ThreadSafeDict() self.down_process_pool: serve_utils.ThreadSafeDict[ - str, subprocess.Popen] = serve_utils.ThreadSafeDict() + int, subprocess.Popen] = serve_utils.ThreadSafeDict() self._start_process_pool_refresher() self._start_job_status_fetcher() @@ -253,35 +302,44 @@ def __init__(self, task_yaml_path: str, *args, **kwargs) -> None: # This process periodically checks all sky.launch and sky.down process # on the fly. If any of them finished, it will update the status of # the corresponding replica. 
+ @with_lock def _refresh_process_pool(self) -> None: - for cluster_name, p in list(self.launch_process_pool.items()): + for replica_id, p in list(self.launch_process_pool.items()): if p.poll() is not None: # TODO(tian): Try-catch in process, and have an enum return # value to indicate which type of failure happened. # Currently we only have user code failure since the # retry_until_up flag is set to True, but it will be helpful # when we enable user choose whether to retry or not. - logger.info(f'Launch process for {cluster_name} finished.') - del self.launch_process_pool[cluster_name] - info = self.replica_info[cluster_name] + logger.info( + f'Launch process for replica {replica_id} finished.') + del self.launch_process_pool[replica_id] + info = serve_state.get_replica_info_from_id( + self.service_name, replica_id) + assert info is not None if p.returncode != 0: - logger.warning(f'Launch process for {cluster_name} exited ' - f'abnormally with code {p.returncode}. ' - 'Terminating...') + logger.warning( + f'Launch process for replica {replica_id} exited ' + f'abnormally with code {p.returncode}. 
Terminating...') info.status_property.sky_launch_status = ( ProcessStatus.FAILED) - self._teardown_cluster(cluster_name, sync_down_logs=True) + self._teardown_replica(replica_id, sync_down_logs=True) else: info.status_property.sky_launch_status = ( ProcessStatus.SUCCEEDED) - for cluster_name, p in list(self.down_process_pool.items()): + serve_state.add_or_update_replica(self.service_name, replica_id, + info) + for replica_id, p in list(self.down_process_pool.items()): if p.poll() is not None: - logger.info(f'Down process for {cluster_name} finished.') - del self.down_process_pool[cluster_name] - info = self.replica_info[cluster_name] + logger.info(f'Down process for replica {replica_id} finished.') + del self.down_process_pool[replica_id] + info = serve_state.get_replica_info_from_id( + self.service_name, replica_id) + assert info is not None if p.returncode != 0: - logger.error(f'Down process for {cluster_name} exited ' - f'abnormally with code {p.returncode}.') + logger.error( + f'Down process for replica {replica_id} exited ' + f'abnormally with code {p.returncode}.') info.status_property.sky_down_status = ( ProcessStatus.FAILED) else: @@ -295,13 +353,15 @@ def _refresh_process_pool(self) -> None: # This means the cluster is deleted due to # a scale down. Delete the replica info # so it won't count as a replica. - del self.replica_info[cluster_name] - logger.info(f'Cluster {cluster_name} removed from the ' - 'replica info normally.') + logger.info(f'Replica {replica_id} removed from the ' + 'replica table normally.') + serve_state.remove_replica(self.service_name, replica_id) else: - logger.info(f'Termination of cluster {cluster_name} ' + logger.info(f'Termination of replica {replica_id} ' 'finished. Replica info is kept since some ' 'failure detected.') + serve_state.add_or_update_replica(self.service_name, + replica_id, info) # TODO(tian): Maybe use decorator? 
def _process_pool_refresher(self) -> None: @@ -325,8 +385,10 @@ def _start_process_pool_refresher(self) -> None: target=self._process_pool_refresher) self.process_pool_refresher_thread.start() + @with_lock def _fetch_job_status(self) -> None: - for cluster_name, info in self.replica_info.items(): + infos = serve_state.get_replica_infos(self.service_name) + for info in infos: if not info.status_property.should_track_status(): continue # We use backend API to avoid usage collection in the @@ -343,17 +405,21 @@ def _fetch_job_status(self) -> None: job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: info.status_property.user_app_failed = True - logger.warning(f'User APP for cluster {cluster_name} FAILED. ' - 'Start streaming logs...') - # Always tail the logs of the first job, which represent user - # setup & run. - try: - backend.tail_logs(handle, job_id=1, follow=False) - except Exception as e: # pylint: disable=broad-except - logger.error(f'Error in streaming logs for cluster ' - f'{cluster_name}: {e}') + serve_state.add_or_update_replica(self.service_name, + info.replica_id, info) + logger.warning( + f'User APP for replica {info.replica_id} FAILED. 
' + 'Start streaming logs...') + replica_job_logs_dir = os.path.join( + constants.SKY_LOGS_DIRECTORY, 'replica_jobs') + backend_utils.download_and_stream_latest_job_log( + backend, + handle, + replica_job_logs_dir, + log_position_hint='replica cluster', + log_finish_hint=f'Replica: {info.replica_id}') logger.info('Terminating...') - self._teardown_cluster(cluster_name, sync_down_logs=True) + self._teardown_replica(info.replica_id, sync_down_logs=True) def _job_status_fetcher(self) -> None: while True: @@ -387,33 +453,33 @@ def _terminate_daemon_threads(self) -> None: def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: return [ info.to_info_dict(with_handle=verbose) - for info in self.replica_info.values() + for info in serve_state.get_replica_infos(self.service_name) ] def total_replica_num(self, count_failed_replica: bool) -> int: + infos = serve_state.get_replica_infos(self.service_name) if count_failed_replica: - return len(self.replica_info) - return len([ - i for i in self.replica_info.values() - if i.status != serve_state.ReplicaStatus.FAILED - ]) + return len(infos) + return len( + [i for i in infos if i.status != serve_state.ReplicaStatus.FAILED]) def get_ready_replicas(self) -> Set[str]: ready_replicas = set() - for info in self.replica_info.values(): + infos = serve_state.get_replica_infos(self.service_name) + for info in infos: if info.status == serve_state.ReplicaStatus.READY: assert info.ip is not None ready_replicas.add(info.ip) return ready_replicas - def _launch_cluster(self, replica_id: int) -> None: + def _launch_replica(self, replica_id: int) -> None: cluster_name = serve_utils.generate_replica_cluster_name( self.service_name, replica_id) - if cluster_name in self.launch_process_pool: - logger.warning(f'Launch process for cluster {cluster_name} ' + if replica_id in self.launch_process_pool: + logger.warning(f'Launch process for replica {replica_id} ' 'already exists. 
Skipping.') return - logger.info(f'Creating SkyPilot cluster {cluster_name}') + logger.info(f'Creating replica {replica_id}') # TODO(tian): We should do usage_lib.messages.usage.set_internal() # after we change to python API. cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] @@ -426,30 +492,28 @@ def _launch_cluster(self, replica_id: int) -> None: stdin=subprocess.DEVNULL, stdout=f, stderr=f) - self.launch_process_pool[cluster_name] = p - assert cluster_name not in self.replica_info - self.replica_info[cluster_name] = ReplicaInfo(replica_id, cluster_name) - - def _scale_up(self, n: int) -> None: - # Launch n new clusters - for _ in range(0, n): - self._launch_cluster(self.next_replica_id) - self.next_replica_id += 1 + self.launch_process_pool[replica_id] = p + info = ReplicaInfo(replica_id, cluster_name) + serve_state.add_or_update_replica(self.service_name, replica_id, info) def scale_up(self, n: int) -> None: - self._scale_up(n) + # Launch n new replicas + for _ in range(n): + self._launch_replica(self.next_replica_id) + self.next_replica_id += 1 - def _teardown_cluster(self, - cluster_name: str, + def _teardown_replica(self, + replica_id: int, sync_down_logs: bool = True) -> None: - if cluster_name in self.down_process_pool: - logger.warning(f'Down process for cluster {cluster_name} already ' + if replica_id in self.down_process_pool: + logger.warning(f'Down process for replica {replica_id} already ' 'exists. 
Skipping.') return - replica_id = serve_utils.get_replica_id_from_cluster_name(cluster_name) if sync_down_logs: - logger.info(f'Syncing down logs for cluster {cluster_name}...') + logger.info(f'Syncing down logs for replica {replica_id}...') + # TODO(tian): Maybe use + # backend_utils.download_and_stream_latest_job_log here code = serve_utils.ServeCodeGen.stream_replica_logs( self.service_name, replica_id, @@ -464,13 +528,16 @@ def _teardown_cluster(self, except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should teardown the # cluster. - msg = ('Error in syncing down logs for cluster ' - f'{cluster_name}: {e}') + msg = ('Error in syncing down logs for replica ' + f'{replica_id}: {e}') logger.error(msg) print(msg, file=f) - logger.info(f'Deleting SkyPilot cluster {cluster_name}') - cmd = ['sky', 'down', cluster_name, '-y'] + logger.info(f'Deleting replica {replica_id}') + info = serve_state.get_replica_info_from_id(self.service_name, + replica_id) + assert info is not None + cmd = ['sky', 'down', info.cluster_name, '-y'] log_file_name = serve_utils.generate_replica_down_log_file_name( self.service_name, replica_id) with open(log_file_name, 'w') as f: @@ -479,33 +546,23 @@ def _teardown_cluster(self, stdin=subprocess.DEVNULL, stdout=f, stderr=f) - self.down_process_pool[cluster_name] = p - info = self.replica_info[cluster_name] + self.down_process_pool[replica_id] = p info.status_property.sky_down_status = ProcessStatus.RUNNING - - def _scale_down(self, n: int) -> None: - # Randomly delete n ready replicas - all_ready_replicas = self.get_ready_replicas() - num_replicas = len(all_ready_replicas) - if num_replicas > 0: - if n > num_replicas: - logger.warning( - f'Trying to delete {n} replicas, but only {num_replicas} ' - 'replicas exist. 
Deleting all replicas.') - n = num_replicas - cluster_to_terminate = random.sample(all_ready_replicas, n) - for cluster_name in cluster_to_terminate: - logger.info(f'Scaling down cluster {cluster_name}') - self._teardown_cluster(cluster_name) + serve_state.add_or_update_replica(self.service_name, replica_id, info) def scale_down(self, n: int) -> None: - self._scale_down(n) + # Terminate n replicas + # TODO(tian): Policy to choose replica to scale down. + infos = serve_state.get_replica_infos(self.service_name) + for i in range(n): + self._teardown_replica(infos[i].replica_id) + # TODO(tian): Maybe just kill all threads and cleanup using db record def terminate(self) -> Optional[str]: logger.info('Terminating infra provider daemon threads...') self._terminate_daemon_threads() logger.info('Terminating all clusters...') - for name, p in self.launch_process_pool.items(): + for replica_id, p in self.launch_process_pool.items(): # Use keyboard interrupt here since sky.launch has great # handling for it # Edge case: sky.launched finished after the @@ -516,19 +573,26 @@ def terminate(self) -> Optional[str]: # here since sky.launch has great handling for it. 
_interrupt_process_and_children(p.pid) p.wait() - logger.info(f'Interrupted launch process for cluster {name} ' - 'and deleted the cluster.') - self._teardown_cluster(name, sync_down_logs=False) - info = self.replica_info[name] + logger.info( + f'Interrupted launch process for replica {replica_id} ' + 'and deleted the cluster.') + self._teardown_replica(replica_id, sync_down_logs=False) + info = serve_state.get_replica_info_from_id( + self.service_name, replica_id) + assert info is not None # Set to success here for correctly display as shutting down info.status_property.sky_launch_status = ProcessStatus.SUCCEEDED + serve_state.add_or_update_replica(self.service_name, replica_id, + info) msg = [] - for name, info in self.replica_info.items(): + infos = serve_state.get_replica_infos(self.service_name) + # TODO(tian): Move all cleanup to the control process + for info in infos: if info.status in [ serve_state.ReplicaStatus.FAILED_CLEANUP, serve_state.ReplicaStatus.UNKNOWN, ]: - msg.append(f'Cluster with status {info.status} found. Please ' + msg.append(f'Replica with status {info.status} found. 
Please ' 'manually check the cloud console to make sure no ' 'resource leak.') # Skip those already deleted and those are deleting @@ -536,16 +600,19 @@ def terminate(self) -> Optional[str]: serve_state.ReplicaStatus.FAILED, serve_state.ReplicaStatus.SHUTTING_DOWN ]: - self._teardown_cluster(name, sync_down_logs=False) - for name, p in self.down_process_pool.items(): + self._teardown_replica(info.replica_id, sync_down_logs=False) + for replica_id, p in self.down_process_pool.items(): p.wait() - logger.info(f'Down process for cluster {name} finished.') + logger.info(f'Down process for replica {replica_id} finished.') if p.returncode != 0: - logger.warning(f'Down process for cluster {name} exited ' + logger.warning(f'Down process for replica {replica_id} exited ' f'abnormally with code {p.returncode}.') - msg.append(f'Down process for cluster {name} exited abnormally' - f' with code {p.returncode}. Please login to the ' - 'controller and make sure the cluster is released.') + msg.append( + f'Down process for replica {replica_id} exited abnormally' + f' with code {p.returncode}. Please login to the ' + 'controller and make sure the replica is released.') + else: + serve_state.remove_replica(self.service_name, replica_id) if not msg: return None return '\n'.join(msg) @@ -573,99 +640,80 @@ def _start_replica_prober(self) -> None: target=self._replica_prober) self.replica_prober_thread.start() + @with_lock def _probe_all_replicas(self) -> None: replica_info = self.get_replica_info( verbose=env_options.Options.SHOW_DEBUG_INFO.get()) logger.info(f'All replica info: {replica_info}') - def _probe_replica(info: ReplicaInfo) -> Tuple[str, bool]: - replica_ip = info.ip - try: - msg = '' - # TODO(tian): Support HTTPS in the future. 
- readiness_path = f'http://{replica_ip}{self.readiness_suffix}' - if self.post_data is not None: - msg += 'Post' - response = requests.post( - readiness_path, - json=self.post_data, - timeout=constants.READINESS_PROBE_TIMEOUT) - else: - msg += 'Get' - response = requests.get( - readiness_path, - timeout=constants.READINESS_PROBE_TIMEOUT) - msg += (f' request to {replica_ip} returned status code ' - f'{response.status_code}') - if response.status_code == 200: - msg += '.' - else: - msg += f' and response {response.text}.' - logger.info(msg) - if response.status_code == 200: - logger.info(f'Replica {replica_ip} is ready.') - if self.uptime is None: - self.uptime = time.time() - logger.info(f'Replica {replica_ip} is the first ' - 'ready replica. Setting uptime to ' - f'{self.uptime}.') - serve_state.set_uptime(self.service_name, - int(self.uptime)) - return info.cluster_name, True - except requests.exceptions.RequestException as e: - logger.info(e) - logger.info(f'Replica {replica_ip} is not ready.') - pass - return info.cluster_name, False - probe_futures = [] replica_to_probe = [] with futures.ThreadPoolExecutor() as executor: - for cluster_name, info in self.replica_info.items(): + infos = serve_state.get_replica_infos(self.service_name) + for info in infos: if not info.status_property.should_track_status(): continue replica_to_probe.append((info.cluster_name, info.ip)) - probe_futures.append(executor.submit(_probe_replica, info)) + probe_futures.append( + executor.submit(info.probe, self.readiness_suffix, + self.post_data)) logger.info(f'Replicas to probe: {replica_to_probe}') for future in futures.as_completed(probe_futures): - cluster_name, probe_succeeded = future.result() - info = self.replica_info[cluster_name] + future_result: Tuple[ReplicaInfo, bool] = future.result() + info, probe_succeeded = future_result info.status_property.service_ready_now = probe_succeeded + should_teardown = False if probe_succeeded: + if self.uptime is None: + self.uptime = 
time.time() + logger.info(f'Replica {info.replica_id} is the first ' + 'ready replica. Setting uptime to ' + f'{self.uptime}.') + serve_state.set_service_uptime(self.service_name, + int(self.uptime)) info.consecutive_failure_times.clear() if not info.status_property.service_once_ready: info.status_property.service_once_ready = True - continue - current_time = time.time() - if info.first_not_ready_time is None: - info.first_not_ready_time = current_time - if info.status_property.service_once_ready: - info.consecutive_failure_times.append(current_time) - consecutive_failure_time = (info.consecutive_failure_times[-1] - - info.consecutive_failure_times[0]) - if (consecutive_failure_time >= - _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): - logger.info(f'Replica {cluster_name} is not ready for too ' - 'long and exceeding consecutive failure ' - 'threshold. Terminating the replica...') - self._teardown_cluster(cluster_name) - else: - logger.info(f'Replica {cluster_name} is not ready but ' - 'within consecutive failure threshold ' - f'({consecutive_failure_time}s / ' - f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' - 'Skipping.') else: - current_delay_seconds = current_time - info.first_not_ready_time - if current_delay_seconds > self.initial_delay_seconds: - logger.info(f'Replica {cluster_name} is not ready and ' - 'exceeding initial delay seconds. ' - 'Terminating the replica...') - self._teardown_cluster(cluster_name) + current_time = time.time() + if info.first_not_ready_time is None: + info.first_not_ready_time = current_time + if info.status_property.service_once_ready: + info.consecutive_failure_times.append(current_time) + consecutive_failure_time = ( + info.consecutive_failure_times[-1] - + info.consecutive_failure_times[0]) + if (consecutive_failure_time >= + _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): + logger.info( + f'Replica {info.replica_id} is not ready for too ' + 'long and exceeding consecutive failure ' + 'threshold. 
Terminating the replica...') + should_teardown = True + else: + logger.info( + f'Replica {info.replica_id} is not ready but ' + 'within consecutive failure threshold ' + f'({consecutive_failure_time}s / ' + f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' + 'Skipping.') else: - current_delay_seconds = int(current_delay_seconds) - logger.info( - f'Replica {cluster_name} is not ready but within ' - f'initial delay seconds ({current_delay_seconds}s / ' - f'{self.initial_delay_seconds}s). Skipping.') + current_delay_seconds = (current_time - + info.first_not_ready_time) + if current_delay_seconds > self.initial_delay_seconds: + logger.info( + f'Replica {info.replica_id} is not ready and ' + 'exceeding initial delay seconds. Terminating ' + 'the replica...') + should_teardown = True + else: + current_delay_seconds = int(current_delay_seconds) + logger.info( + f'Replica {info.replica_id} is not ready but within' + f' initial delay seconds ({current_delay_seconds}s ' + f'/ {self.initial_delay_seconds}s). 
Skipping.') + serve_state.add_or_update_replica(self.service_name, + info.replica_id, info) + if should_teardown: + self._teardown_replica(info.replica_id) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 3b5a80ae73b..2f5986aac64 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -2,7 +2,9 @@ import collections import enum import pathlib +import pickle import sqlite3 +import typing from typing import Any, Dict, List, Optional import colorama @@ -10,6 +12,9 @@ from sky.serve import constants from sky.utils import db_utils +if typing.TYPE_CHECKING: + from sky.serve import infra_providers + _DB_PATH = pathlib.Path(constants.SERVE_PREFIX) / 'services.db' _DB_PATH = _DB_PATH.expanduser().absolute() _DB_PATH.parents[0].mkdir(parents=True, exist_ok=True) @@ -27,9 +32,16 @@ controller_port INTEGER, status TEXT, uptime INTEGER DEFAULT NULL)""") +_CURSOR.execute("""\ + CREATE TABLE IF NOT EXISTS replicas ( + service_name TEXT, + replica_id INTEGER, + replica_info BLOB, + PRIMARY KEY (service_name, replica_id))""") _CONN.commit() +# === Statuses === class ReplicaStatus(enum.Enum): """Replica status.""" @@ -137,6 +149,7 @@ def from_replica_info( } +# === Service functions === def add_service(job_id: int, service_name: str, controller_port: int) -> None: """Adds a service to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: @@ -155,7 +168,7 @@ def remove_service(service_name: str) -> None: DELETE FROM services WHERE name=(?)""", (service_name,)) -def set_uptime(service_name: str, uptime: int) -> None: +def set_service_uptime(service_name: str, uptime: int) -> None: """Sets the uptime of a service.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( @@ -164,7 +177,7 @@ def set_uptime(service_name: str, uptime: int) -> None: uptime=(?) 
WHERE name=(?)""", (uptime, service_name)) -def set_status(service_name: str, status: ServiceStatus) -> None: +def set_service_status(service_name: str, status: ServiceStatus) -> None: """Sets the service status.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( @@ -202,3 +215,51 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]: for row in rows: return _get_service_from_row(row) return None + + +# === Replica functions === +def add_or_update_replica(service_name: str, replica_id: int, + replica_info: 'infra_providers.ReplicaInfo') -> None: + """Adds a replica to the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + INSERT INTO replicas + (service_name, replica_id, replica_info) + VALUES (?, ?, ?)""", + (service_name, replica_id, pickle.dumps(replica_info))) + + +def remove_replica(service_name: str, replica_id: int) -> None: + """Removes a replica from the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + DELETE FROM replicas + WHERE service_name=(?) + AND replica_id=(?)""", (service_name, replica_id)) + + +def get_replica_info_from_id( + service_name: str, + replica_id: int) -> Optional['infra_providers.ReplicaInfo']: + """Gets a replica info from the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + rows = cursor.execute( + """\ + SELECT replica_info FROM replicas + WHERE service_name=(?) 
+ AND replica_id=(?)""", (service_name, replica_id)).fetchall() + for row in rows: + return pickle.loads(row[0]) + return None + + +def get_replica_infos(service_name: str) -> List['infra_providers.ReplicaInfo']: + """Gets all replica infos of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + rows = cursor.execute( + """\ + SELECT replica_info FROM replicas + WHERE service_name=(?)""", (service_name,)).fetchall() + return [pickle.loads(row[0]) for row in rows] diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e6824780527..62fa4619add 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -315,7 +315,7 @@ def set_service_status_from_replica_info( # terminated, the service status will still be READY, but we don't want # change service status to READY. return - serve_state.set_status( + serve_state.set_service_status( service_name, serve_state.ServiceStatus.from_replica_info(replica_info)) @@ -328,8 +328,8 @@ def update_service_status() -> None: controller_status = job_lib.get_status(record['controller_job_id']) if controller_status is None or controller_status.is_terminal(): # If controller job is not running, set it as controller failed. - serve_state.set_status(record['name'], - serve_state.ServiceStatus.CONTROLLER_FAILED) + serve_state.set_service_status( + record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED) class ServiceHandle(object): diff --git a/sky/spot/controller.py b/sky/spot/controller.py index 7bd68f5a07d..287b4a8610a 100644 --- a/sky/spot/controller.py +++ b/sky/spot/controller.py @@ -86,38 +86,12 @@ def _download_log_and_stream( """ spot_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, 'spot_jobs') - os.makedirs(spot_job_logs_dir, exist_ok=True) - try: - log_dirs = self._backend.sync_down_logs( - handle, - # Download the log of the latest job. 
- # The job_id for the spot job running on the spot cluster is not - # necessarily 1, as it is possible that the worker node in a - # multi-node cluster is preempted, and we recover the spot job - # on the existing cluster, which leads to a larger job_id. Those - # job_ids all represent the same logical spot job. - job_ids=None, - local_dir=spot_job_logs_dir) - except exceptions.CommandError as e: - logger.info(f'Failed to download the logs: ' - f'{common_utils.format_exception(e)}') - else: - if not log_dirs: - logger.error('Failed to find the logs for the user program in ' - 'the spot cluster.') - else: - log_dir = list(log_dirs.values())[0] - log_file = os.path.join(log_dir, 'run.log') - - # Print the logs to the console. - try: - with open(log_file) as f: - print(f.read()) - except FileNotFoundError: - logger.error('Failed to find the logs for the user ' - f'program at {log_file}.') - else: - logger.info(f'\n== End of logs (ID: {self._job_id}) ==') + backend_utils.download_and_stream_latest_job_log( + self._backend, + handle, + spot_job_logs_dir, + log_position_hint='spot cluster', + log_finish_hint=f'ID: {self._job_id}') def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: """Busy loop monitoring spot cluster status and handling recovery. 
From 688bb20edcac461e355a7ca53ab6c8020280066d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 11:23:50 -0700 Subject: [PATCH 113/223] fix --- sky/serve/serve_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 2f5986aac64..f268d3153b3 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -224,7 +224,7 @@ def add_or_update_replica(service_name: str, replica_id: int, with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( """\ - INSERT INTO replicas + INSERT OR REPLACE INTO replicas (service_name, replica_id, replica_info) VALUES (?, ?, ?)""", (service_name, replica_id, pickle.dumps(replica_info))) From 09fd88e0f38ee153a26fe6e2b05c2160783b7a2e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 11:45:52 -0700 Subject: [PATCH 114/223] fix scale down & cleanup failed replica --- sky/serve/infra_providers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 4e8269f61b0..4f6c5836f12 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -554,7 +554,11 @@ def scale_down(self, n: int) -> None: # Terminate n replicas # TODO(tian): Policy to choose replica to scale down. infos = serve_state.get_replica_infos(self.service_name) - for i in range(n): + if len(infos) < n: + logger.error(f'Cannot scale down {n} replicas since there are ' + f'only {len(infos)} replicas. 
Scale down all ' + 'replicas instead.') + for i in range(min(n, len(infos))): self._teardown_replica(infos[i].replica_id) # TODO(tian): Maybe just kill all threads and cleanup using db record @@ -613,6 +617,16 @@ def terminate(self) -> Optional[str]: 'controller and make sure the replica is released.') else: serve_state.remove_replica(self.service_name, replica_id) + infos = serve_state.get_replica_infos(self.service_name) + for info in infos: + if not info.status in serve_state.ReplicaStatus.failed_statuses(): + # This should not happen since we already teardown all + # replicas. Here we just add a double check. + msg.append(f'Replica {info.replica_id} is not deleted. ' + 'Please login to the controller and make sure ' + 'the replica is released.') + else: + serve_state.remove_replica(self.service_name, info.replica_id) if not msg: return None return '\n'.join(msg) From f13e3efaf45303dcd5cea57a072f2418d0e93c69 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 12:13:26 -0700 Subject: [PATCH 115/223] minor --- sky/serve/infra_providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 4f6c5836f12..5e6f8cffc9e 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -612,8 +612,8 @@ def terminate(self) -> Optional[str]: logger.warning(f'Down process for replica {replica_id} exited ' f'abnormally with code {p.returncode}.') msg.append( - f'Down process for replica {replica_id} exited abnormally' - f' with code {p.returncode}. Please login to the ' + f'Down process for replica {replica_id} exited abnormally ' + f'with code {p.returncode}. 
Please login to the ' 'controller and make sure the replica is released.') else: serve_state.remove_replica(self.service_name, replica_id) From aeca540ecf0becc53c4af9fc89864747216a2912 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 15:58:11 -0700 Subject: [PATCH 116/223] move controller resources to config.yaml --- sky/core.py | 6 +- sky/execution.py | 31 ++++--- sky/serve/serve_utils.py | 99 ++++++++------------- sky/serve/service_spec.py | 11 --- sky/utils/schemas.py | 4 - tests/conftest.py | 9 ++ tests/skyserve/auto_restart.yaml | 2 - tests/skyserve/http/aws.yaml | 2 - tests/skyserve/http/azure.yaml | 2 - tests/skyserve/http/gcp.yaml | 2 - tests/skyserve/http/mixed_cloud.yaml | 16 ---- tests/skyserve/llm/service.yaml | 2 - tests/skyserve/replica_failure/service.yaml | 2 - tests/test_smoke.py | 18 ++-- 14 files changed, 77 insertions(+), 129 deletions(-) delete mode 100644 tests/skyserve/http/mixed_cloud.yaml diff --git a/sky/core.py b/sky/core.py index 439c8463591..89e21148b75 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1236,9 +1236,9 @@ def serve_down(service_name: str, purge: bool = False) -> None: # pylint: disable=broad-except except Exception as e: if purge: - logger.warning('Ignoring error when stopping controller and ' - f'load balancer jobs of service {service_name!r}: ' - f'{common_utils.format_exception(e)}') + logger.warning( + f'Ignoring error when clean up service {service_name!r}: ' + f'{common_utils.format_exception(e)}') else: with ux_utils.print_exception_no_traceback(): raise RuntimeError(e) from e diff --git a/sky/execution.py b/sky/execution.py index 45d5555783f..e4d4b487aea 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -63,6 +63,11 @@ sky.spot_launch(task, ...) """.strip() +_CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( + '{controller_type} controller resources is not valid, please check ' + '~/.sky/config.yaml file and make sure ' + '{controller_type}.controller.resources is a valid resources spec. 
' + 'Details:\n {err}') def _convert_to_dag(entrypoint: Any) -> 'sky.Dag': @@ -737,12 +742,10 @@ def spot_launch( except ValueError as e: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Spot controller resources is not valid, please check ' - '~/.sky/config.yaml file and make sure ' - 'spot.controller.resources is a valid resources spec. ' - 'Details:\n' - f' {common_utils.format_exception(e, use_bracket=True)}' - ) from e + _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( + controller_type='spot', + err=common_utils.format_exception( + e, use_bracket=True))) from e yaml_path = os.path.join(spot.SPOT_CONTROLLER_YAML_PREFIX, f'{name}-{dag_uuid}.yaml') @@ -986,8 +989,12 @@ def serve_up( raise RuntimeError('Service section not found.') controller_resources_config: Dict[str, Any] = copy.copy( serve.CONTROLLER_RESOURCES) - if task.service.controller_resources is not None: - controller_resources_config.update(task.service.controller_resources) + # Override the controller resources with the ones specified in the + # config. 
+ custom_controller_resources_config = skypilot_config.get_nested( + ('serve', 'controller', 'resources'), None) + if custom_controller_resources_config is not None: + controller_resources_config.update(custom_controller_resources_config) if 'ports' in controller_resources_config: with ux_utils.print_exception_no_traceback(): raise ValueError('Cannot specify ports for controller resources.') @@ -997,7 +1004,10 @@ def serve_up( except ValueError as e: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Encountered error when parsing controller resources') from e + _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( + controller_type='serve', + err=common_utils.format_exception(e, + use_bracket=True))) from e assert task.service is not None, task assert len(task.resources) == 1, task @@ -1021,8 +1031,7 @@ def serve_up( with filelock.FileLock( os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH), serve.CONTROLLER_FILE_LOCK_TIMEOUT): - controller_name, _ = serve.get_available_controller_name( - controller_resources) + controller_name, _ = serve.get_available_controller_name() controller_port, load_balancer_port = ( serve.gen_ports_for_serve_process(controller_name)) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 62fa4619add..fc9e73d2ca6 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -89,7 +89,7 @@ def kill_children_and_self_processes() -> None: os.kill(os.getpid(), signal.SIGKILL) -def get_existing_controller_names() -> Set[str]: +def _get_existing_controller_names() -> Set[str]: """Get existing sky serve controller names. 
There is two possible indicators for a controller: @@ -198,109 +198,82 @@ def gen_ports_for_serve_process(controller_name: str) -> Tuple[int, int]: return controller_port, load_balancer_port -def _get_service_num_on_controller_if_available( - controller_name: str, - requested_controller_resources: 'sky.Resources') -> Optional[int]: - """Get number of services on the controller if it is available. +def _get_service_slot_on_controller(controller_name: str) -> int: + """Get the number of slots to run services on the controller. - A controller is available if requested controller resources is less - demanding than the controller resources, and have available slots for - services. Max number of services on a controller is determined by the memory - of the controller, since ray job and our skypilot code is very memory + A controller only have limited available slots for a new services. + Max number of slots on a controller is determined by the memory of + the controller, since ray job and our skypilot code is very memory demanding (~1GB/service). Args: controller_name: The name of the controller. - requested_controller_resources: The resources requested for controller. Returns: - Number of services on the controller if it is available, otherwise None. + Number of slots on the controller. """ - controller_available = False - max_memory_requirements = 0. + memory_requirements = 0. controller_record = global_user_state.get_cluster_from_name(controller_name) if controller_record is not None: # If controller is already created, use its launched resources. handle = controller_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) - if requested_controller_resources.less_demanding_than( - handle.launched_resources): - controller_available = True # Determine max number of services on this controller. 
controller_cloud = handle.launched_resources.cloud - _, max_memory_requirements = ( + _, memory_requirements = ( controller_cloud.get_vcpus_mem_from_instance_type( handle.launched_resources.instance_type)) else: # Corner case: Multiple `sky serve up` are running simultaneously # and the controller is not created yet. We created a resources - # for each initializing controller, and find the most demanding - # one to represent the controller resources. + # for each initializing controller, and use the minimal memory + # requirement among them, since any of them could be the first to + # launch the controller. service_records = (global_user_state.get_services_from_controller_name( controller_name)) for service_record in service_records: r = service_record['handle'].requested_controller_resources - # If any service is more demanding than the requested resources, - # then the controller is available since it must be launched - # with the most demanding resources, which is more demanding - # than the requested resources. - if requested_controller_resources.less_demanding_than(r): - controller_available = True - # Don't break here since we still want to find the max - # memory requirements. # Remove the '+' in memory requirement. - max_memory_requirements = max(max_memory_requirements, - float(r.memory.strip('+'))) - if controller_available: - # Determine max number of services on this controller. - max_services_num = int(max_memory_requirements / - constants.SERVICES_MEMORY_USAGE_GB) - # Get current number of services on this controller. - services_num_on_controller = len( - global_user_state.get_services_from_controller_name( - controller_name)) - # Only consider controllers that have available slots for services. 
- if services_num_on_controller < max_services_num: - return services_num_on_controller - return None - - -def get_available_controller_name( - controller_resources: 'sky.Resources') -> Tuple[str, bool]: + memory_requirements = min(memory_requirements, + float(r.memory.strip('+'))) + # Determine max number of services on this controller. + max_services_num = int(memory_requirements / + constants.SERVICES_MEMORY_USAGE_GB) + # Get current number of services on this controller. + services_num_on_controller = len( + global_user_state.get_services_from_controller_name(controller_name)) + return max_services_num - services_num_on_controller + + +def get_available_controller_name() -> Tuple[str, bool]: """Get available controller name to use. - Only consider controllers that satisfy the requested controller resources, - and have available slots for services. + Only consider controllers that have available slots for services. If multiple controllers are available, choose the one with most number of services to decrease the number of controllers. This function needs to be called within a lock, to avoid concurrency issue from `existing_controllers` being staled, also, to avoid multiple `sky serve up` select the same last slot on a controller. - Args: - controller_resources: The resources requested for controller. - Returns: A tuple of controller name and a boolean value indicating whether the controller name is newly generated. """ # Get all existing controllers. - existing_controllers = get_existing_controller_names() - available_controller_to_service_num = dict() + existing_controllers = _get_existing_controller_names() + controller2slots = dict() # Get a mapping from controller name to number of services on it. 
for controller_name in existing_controllers: - services_num_on_controller = ( - _get_service_num_on_controller_if_available(controller_name, - controller_resources)) - if services_num_on_controller is not None: - available_controller_to_service_num[controller_name] = ( - services_num_on_controller) - if not available_controller_to_service_num: + num_slots = _get_service_slot_on_controller(controller_name) + # Only consider controllers that have available slots for services. + if num_slots > 0: + controller2slots[controller_name] = num_slots + if not controller2slots: return generate_controller_cluster_name(existing_controllers), True - # If multiple controllers are available, choose the one with most number of - # services. - return max(available_controller_to_service_num.keys(), - key=lambda k: available_controller_to_service_num[k]), False + # If multiple controllers are available, choose the one with least number of + # slots, i.e. most number of services. + return min(controller2slots.keys(), + key=lambda k: controller2slots[k]), False def set_service_status_from_replica_info( diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 03d94117c71..b2703826403 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -25,7 +25,6 @@ def __init__( qps_upper_threshold: Optional[float] = None, qps_lower_threshold: Optional[float] = None, post_data: Optional[Dict[str, Any]] = None, - controller_resources: Optional[Dict[str, Any]] = None, auto_restart: bool = False, ) -> None: if min_replicas < 0: @@ -54,7 +53,6 @@ def __init__( self._qps_upper_threshold = qps_upper_threshold self._qps_lower_threshold = qps_lower_threshold self._post_data = post_data - self._controller_resources = controller_resources self._auto_restart = auto_restart @staticmethod @@ -118,9 +116,6 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': service_config['auto_restart'] = policy_section.get( 'auto_restart', False) - 
service_config['controller_resources'] = config.pop( - 'controller_resources', None) - return SkyServiceSpec(**service_config) @staticmethod @@ -169,8 +164,6 @@ def add_if_not_none(section, key, value, no_empty: bool = False): add_if_not_none('replica_policy', 'qps_lower_threshold', self.qps_lower_threshold) add_if_not_none('replica_policy', 'auto_restart', self._auto_restart) - add_if_not_none('controller_resources', None, - self._controller_resources) return config @@ -232,10 +225,6 @@ def qps_lower_threshold(self) -> Optional[float]: def post_data(self) -> Optional[Dict[str, Any]]: return self._post_data - @property - def controller_resources(self) -> Optional[Dict[str, Any]]: - return self._controller_resources - @property def auto_restart(self) -> bool: return self._auto_restart diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 8c348470ec0..c70f28c3665 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -213,10 +213,6 @@ def get_service_schema(): 'replicas': { 'type': 'integer', }, - # resources config is validated separately using RESOURCES_SCHEMA - 'controller_resources': { - 'type': 'object', - }, } } diff --git a/tests/conftest.py b/tests/conftest.py index 6f3fb89d083..5105b8b0461 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,6 +57,10 @@ def pytest_addoption(parser): action='store_true', default=False, help='Only run tests for managed spot.') + parser.addoption('--sky-serve', + action='store_true', + default=False, + help='Only run tests for sky serve.') parser.addoption( '--generic-cloud', type=str, @@ -105,6 +109,8 @@ def pytest_collection_modifyitems(config, items): skip_marks['slow'] = pytest.mark.skip(reason='need --runslow option to run') skip_marks['managed_spot'] = pytest.mark.skip( reason='skipped, because --managed-spot option is set') + skip_marks['sky_serve'] = pytest.mark.skip( + reason='skipped, because --sky-serve option is set') for cloud in all_clouds_in_smoke_tests: skip_marks[cloud] = 
pytest.mark.skip( reason=f'tests for {cloud} is skipped, try setting --{cloud}') @@ -131,6 +137,9 @@ def pytest_collection_modifyitems(config, items): if (not 'managed_spot' in item.keywords) and config.getoption('--managed-spot'): item.add_marker(skip_marks['managed_spot']) + if (not 'sky_serve' + in item.keywords) and config.getoption('--sky-serve'): + item.add_marker(skip_marks['sky_serve']) # Check if tests need to be run serially for Kubernetes and Lambda Cloud # We run Lambda Cloud tests serially because Lambda Cloud rate limits its diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index baa9495b4bd..98c3d255e4e 100644 --- a/tests/skyserve/auto_restart.yaml +++ b/tests/skyserve/auto_restart.yaml @@ -15,5 +15,3 @@ service: replica_policy: min_replicas: 1 auto_restart: true - controller_resources: - cloud: gcp diff --git a/tests/skyserve/http/aws.yaml b/tests/skyserve/http/aws.yaml index 4482e28aeca..2faff262427 100644 --- a/tests/skyserve/http/aws.yaml +++ b/tests/skyserve/http/aws.yaml @@ -12,5 +12,3 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 - controller_resources: - cloud: aws diff --git a/tests/skyserve/http/azure.yaml b/tests/skyserve/http/azure.yaml index 2b0faf26b7a..3c4c474a184 100644 --- a/tests/skyserve/http/azure.yaml +++ b/tests/skyserve/http/azure.yaml @@ -13,5 +13,3 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 - controller_resources: - cloud: azure diff --git a/tests/skyserve/http/gcp.yaml b/tests/skyserve/http/gcp.yaml index 60ad9d26362..abefcc6563a 100644 --- a/tests/skyserve/http/gcp.yaml +++ b/tests/skyserve/http/gcp.yaml @@ -13,5 +13,3 @@ service: path: /health initial_delay_seconds: 20 replicas: 2 - controller_resources: - cloud: gcp diff --git a/tests/skyserve/http/mixed_cloud.yaml b/tests/skyserve/http/mixed_cloud.yaml deleted file mode 100644 index a4b3bd16bf0..00000000000 --- a/tests/skyserve/http/mixed_cloud.yaml +++ /dev/null @@ -1,16 +0,0 @@ -resources: - cloud: 
gcp - cpus: 2+ - -workdir: examples/serve/http_server - -run: python3 server.py - -service: - port: 8081 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 2 - controller_resources: - cloud: aws diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml index 07ed0c29068..42f0a83af02 100644 --- a/tests/skyserve/llm/service.yaml +++ b/tests/skyserve/llm/service.yaml @@ -7,8 +7,6 @@ service: port: 8087 readiness_probe: /v1/models replicas: 1 - controller_resources: - cloud: gcp setup: | conda activate chatbot diff --git a/tests/skyserve/replica_failure/service.yaml b/tests/skyserve/replica_failure/service.yaml index 096dc8cdf34..cfba56168d8 100644 --- a/tests/skyserve/replica_failure/service.yaml +++ b/tests/skyserve/replica_failure/service.yaml @@ -16,5 +16,3 @@ service: # For install dependencies initial_delay_seconds: 180 replicas: 3 - controller_resources: - cloud: gcp diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9d57168c4ce..e261dc02eab 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -16,6 +16,9 @@ # Only run managed spot tests # > pytest tests/test_smoke.py --managed-spot # +# Only run sky serve tests +# > pytest tests/test_smoke.py --sky-serve +# # Only run test for AWS + generic tests # > pytest tests/test_smoke.py --aws # @@ -2673,6 +2676,7 @@ def _get_skyserve_http_test(name: str, cloud: str, @pytest.mark.gcp +@pytest.mark.sky_serve def test_skyserve_gcp_http(): """Test skyserve on GCP""" name = _get_service_name() @@ -2681,6 +2685,7 @@ def test_skyserve_gcp_http(): @pytest.mark.aws +@pytest.mark.sky_serve def test_skyserve_aws_http(): """Test skyserve on AWS""" name = _get_service_name() @@ -2689,6 +2694,7 @@ def test_skyserve_aws_http(): @pytest.mark.azure +@pytest.mark.sky_serve def test_skyserve_azure_http(): """Test skyserve on Azure""" name = _get_service_name() @@ -2697,15 +2703,7 @@ def test_skyserve_azure_http(): @pytest.mark.gcp -@pytest.mark.aws -def 
test_skyserve_mixed_cloud_http(): - """Test skyserve on mixed cloud""" - name = _get_service_name() - test = _get_skyserve_http_test(name, 'mixed_cloud', 20) - run_one_test(test) - - -@pytest.mark.gcp +@pytest.mark.sky_serve def test_skyserve_llm(): """Test skyserve with real LLM usecase""" name = _get_service_name() @@ -2737,6 +2735,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: @pytest.mark.gcp +@pytest.mark.sky_serve def test_skyserve_replica_failure(): """Test skyserve with manually interrupting some replica""" name = _get_service_name() @@ -2783,6 +2782,7 @@ def terminate_replica(replica_id: int) -> str: @pytest.mark.gcp +@pytest.mark.sky_serve def test_skyserve_auto_restart(): """Test skyserve with auto restart""" name = _get_service_name() From 31554534ba48ec01c94c5abc421a4fbedee3a4a3 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 12 Oct 2023 17:16:34 -0700 Subject: [PATCH 117/223] fix smoke test --- sky/serve/__init__.py | 1 + sky/serve/constants.py | 3 +++ sky/serve/infra_providers.py | 3 +-- tests/test_smoke.py | 15 ++++++++++----- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index a5d6ab93c19..028479eb415 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -8,6 +8,7 @@ from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE +from sky.serve.constants import ENDPOINT_PROBE_INTERVAL from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.serve_state import ReplicaStatus diff --git a/sky/serve/constants.py b/sky/serve/constants.py index f3cbe265ce9..d897ede038a 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -24,6 +24,9 @@ # for each service, also send the number of requests in last query interval. 
CONTROLLER_SYNC_INTERVAL = 20 +# Interval to probe replica endpoint. +ENDPOINT_PROBE_INTERVAL = 10 + # The default timeout for a readiness probe request. We set the timeout to 15s # since using actual generation in LLM services as readiness probe is very # time-consuming (33B, 70B, ...). diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 5e6f8cffc9e..db00e1a8d4d 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -28,7 +28,6 @@ _JOB_STATUS_FETCH_INTERVAL = 30 _PROCESS_POOL_REFRESH_INTERVAL = 20 -_ENDPOINT_PROBE_INTERVAL = 10 # TODO(tian): Maybe let user determine this threshold _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 @@ -642,7 +641,7 @@ def _replica_prober(self) -> None: # No matter what error happens, we should keep the # replica prober running. logger.error(f'Error in replica prober: {e}') - for _ in range(_ENDPOINT_PROBE_INTERVAL): + for _ in range(serve_constants.ENDPOINT_PROBE_INTERVAL): if self.replica_prober_stop_event.is_set(): logger.info('Replica prober terminated.') return diff --git a/tests/test_smoke.py b/tests/test_smoke.py index e261dc02eab..2c1bc4e3d9c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2750,6 +2750,13 @@ def terminate_replica(replica_id: int) -> str: return (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') + # In the worst case, the controller will first wait ENDPOINT_PROBE_INTERVAL + # for next probe, and wait CONTROLLER_SYNC_INTERVAL for load balancer's + # next sync with controller. We add 5s more for any overhead, such as + # database read/write. 
+ time_to_wait_after_terminate = (serve.ENDPOINT_PROBE_INTERVAL + + serve.CONTROLLER_SYNC_INTERVAL + 5) + test = Test( f'test-skyserve-replica-failure', [ @@ -2760,7 +2767,7 @@ def terminate_replica(replica_id: int) -> str: 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', terminate_replica(1), - f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', + f'sleep {time_to_wait_after_terminate}', f'sky serve status {name} | grep 2/3', f'{_get_replica_line(name, 1)} | grep NOT_READY', f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 2)}; ' @@ -2768,7 +2775,7 @@ def terminate_replica(replica_id: int) -> str: 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 2 --replica-ips $ip2 $ip3', terminate_replica(2), - f'sleep {serve.CONTROLLER_SYNC_INTERVAL}', + f'sleep {time_to_wait_after_terminate}', f'sky serve status {name} | grep 1/3', f'{_get_replica_line(name, 2)} | grep NOT_READY', f'{_get_serve_endpoint(name)}; {_get_replica_ip(name, 3)}; ' @@ -2807,11 +2814,9 @@ def terminate_replica(replica_id: int) -> str: f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', terminate_replica(1), 'sleep 180', # Wait for consecutive failure timeout passed. - # Currently failed replica will still count as replica num in sky serve status. - # TODO(tian): Fix this in the future. 
'(while true; do' f' output=$(sky serve status {name});' - ' echo "$output" | grep -q "1/2" && break;' + ' echo "$output" | grep -q "1/1" && break;' ' sleep 10;' f'done); sleep {serve.CONTROLLER_SYNC_INTERVAL};', f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', From 2afdfc376c21ea13bc895263fcfc8817acd67389 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 12:12:08 -0700 Subject: [PATCH 118/223] refactor request information report --- sky/serve/autoscalers.py | 38 +++++++++++++++++------- sky/serve/constants.py | 7 +++++ sky/serve/controller.py | 32 ++++++++++---------- sky/serve/load_balancer.py | 37 ++++++++--------------- sky/serve/load_balancing_policies.py | 31 +------------------- sky/serve/serve_utils.py | 44 ++++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 80 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 433e0a75a2d..d6e340758ba 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -1,11 +1,13 @@ """Autoscalers: perform autoscaling by monitoring metrics.""" +import bisect import logging import threading import time -from typing import Optional +from typing import List, Optional from sky.serve import constants from sky.serve import infra_providers +from sky.serve import serve_utils logger = logging.getLogger(__name__) @@ -35,6 +37,10 @@ def __init__(self, 'controller sync interval. 
It might ' 'not always got the latest information.') + def update_request_information( + self, request_information: serve_utils.RequestInformation) -> None: + raise NotImplementedError + def evaluate_scaling(self) -> None: raise NotImplementedError @@ -48,14 +54,17 @@ def scale_down(self, num_nodes_to_remove: int) -> None: def run(self) -> None: logger.info('Starting autoscaler monitor.') - while not self.run_thread_stop_event.is_set(): + while True: try: self.evaluate_scaling() except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. logger.error(f'Error in autoscaler: {e}') - time.sleep(self.frequency) + for _ in range(self.frequency): + if self.run_thread_stop_event.is_set(): + return + time.sleep(1) def start(self) -> None: self.run_thread_stop_event = threading.Event() @@ -86,18 +95,24 @@ def __init__(self, *args, upper_threshold: Optional[float], self.query_interval: int = query_interval # Time of last scale operation self.last_scale_operation: float = 0. - # Number of requests in the last `query_interval` seconds. - self.num_requests: int = 0 + # All request timestamps + self.request_timestamps: List[float] = [] # Upper threshold for scale up. If None, no scale up. self.upper_threshold: Optional[float] = upper_threshold # Lower threshold for scale down. If None, no scale down. 
self.lower_threshold: Optional[float] = lower_threshold - def set_num_requests(self, num_requests: int) -> None: - self.num_requests = num_requests - - def get_query_interval(self) -> int: - return self.query_interval + def update_request_information( + self, request_information: serve_utils.RequestInformation) -> None: + if not isinstance(request_information, serve_utils.RequestTimestamp): + raise ValueError('Request information must be of type ' + 'serve_utils.RequestTimestamp for ' + 'RequestRateAutoscaler.') + self.request_timestamps.extend(request_information.get()) + current_time = time.time() + index = bisect.bisect_left(self.request_timestamps, + current_time - self.query_interval) + self.request_timestamps = self.request_timestamps[index:] def evaluate_scaling(self) -> None: current_time = time.time() @@ -117,7 +132,8 @@ def evaluate_scaling(self) -> None: return # Convert to requests per second. - num_requests_per_second = float(self.num_requests) / self.query_interval + num_requests_per_second = float(len( + self.request_timestamps)) / self.query_interval # Edge case: num_nodes is zero. requests_per_node = (num_requests_per_second / num_nodes if num_nodes else num_requests_per_second) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index d897ede038a..6e5e66731fa 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -36,6 +36,13 @@ # Wait for 1 minutes for controller / load balancer to terminate. SERVE_TERMINATE_WAIT_TIMEOUT = 60 +# Autoscaler query interval (window size) for request per second. We calculate +# rps by divide the number of requests in last query interval by this interval. +AUTOSCALER_QUERY_INTERVAL = 60 +# Autoscaler scale frequency. We will try to scale up/down every +# `scale_frequency`. +AUTOSCALER_SCALE_FREQUENCY = 20 + # The default controller resources. # We need 200 GB disk space to enable using Azure as controller, since its image # size is 150 GB. 
Also, we need 32 GB memory to run our controller and load diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 430edf99f71..5ad096fe69e 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -17,6 +17,7 @@ from sky import serve from sky import sky_logging from sky.serve import autoscalers +from sky.serve import constants from sky.serve import infra_providers from sky.serve import serve_state from sky.serve import serve_utils @@ -66,23 +67,24 @@ def _check_terminate(self): def run(self) -> None: - @self.app.post('/controller/update_num_requests') - def update_num_requests(request: fastapi.Request): - # await request + @self.app.post('/controller/report_request_information') + def report_request_information(request: fastapi.Request): request_data = asyncio.run(request.json()) - # get request data - num_requests = request_data['num_requests'] - logger.info(f'Received request: {request_data}') + request_information_payload = request_data.get( + 'request_information') + request_information = pickle.loads( + base64.b64decode(request_information_payload)) + logger.info( + f'Received request information: {request_information!r}') if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): - self.autoscaler.set_num_requests(num_requests) + if not isinstance(request_information, + serve_utils.RequestTimestamp): + raise ValueError('Request information must be of type ' + 'serve_utils.RequestTimestamp for ' + 'RequestRateAutoscaler.') + self.autoscaler.update_request_information(request_information) return {'message': 'Success'} - @self.app.get('/controller/get_autoscaler_query_interval') - def get_autoscaler_query_interval(): - if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): - return {'query_interval': self.autoscaler.get_query_interval()} - return {'query_interval': None} - @self.app.get('/controller/get_ready_replicas') def get_ready_replicas(): return {'ready_replicas': self.infra_provider.get_ready_replicas()} @@ 
-183,13 +185,13 @@ def terminate(request: fastapi.Request): _autoscaler = autoscalers.RequestRateAutoscaler( _infra_provider, auto_restart=service_spec.auto_restart, - frequency=20, + frequency=constants.AUTOSCALER_SCALE_FREQUENCY, min_nodes=service_spec.min_replicas, max_nodes=service_spec.max_replicas, upper_threshold=service_spec.qps_upper_threshold, lower_threshold=service_spec.qps_lower_threshold, cooldown=60, - query_interval=60) + query_interval=constants.AUTOSCALER_QUERY_INTERVAL) # ======= SkyServeController ========= controller = SkyServeController(args.controller_port, _infra_provider, diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 7a3afe43321..5bb99e1d74d 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,11 +1,12 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" import argparse +import base64 +import pickle import threading import time import fastapi import requests -from urllib3 import exceptions import uvicorn from sky import sky_logging @@ -38,24 +39,8 @@ def __init__( # This is the port where the replica app listens to. self.replica_port = replica_port self.load_balancing_policy = load_balancing_policy - self.setup_query_interval() - - def setup_query_interval(self): - for _ in range(3): - try: - resp = requests.get(self.controller_url + - '/controller/get_autoscaler_query_interval') - except exceptions.MaxRetryError: - # Retry if cannot connect to controller - continue - if resp.status_code == 200: - self.load_balancing_policy.set_query_interval( - resp.json()['query_interval']) - return - time.sleep(10) - logger.error('Failed to get autoscaler query interval. 
' - 'Use default interval instead.') - self.load_balancing_policy.set_query_interval(None) + self.request_information: serve_utils.RequestInformation = ( + serve_utils.RequestTimestamp()) def _sync_with_controller(self): while True: @@ -74,14 +59,18 @@ def _sync_with_controller(self): logger.info('Controller is terminating. ' 'Shutting down load balancer.') serve_utils.kill_children_and_self_processes() - # send request num in last query interval + # send request information response = session.post( - self.controller_url + '/controller/update_num_requests', + self.controller_url + + '/controller/report_request_information', json={ - 'num_requests': self.load_balancing_policy. - deprecate_old_requests() + 'request_information': base64.b64encode( + pickle.dumps(self.request_information) + ).decode('utf-8') }, timeout=5) + # Clean up after reporting request information to avoid OOM. + self.request_information.clear() response.raise_for_status() # get replica ips response = session.get(self.controller_url + @@ -97,7 +86,7 @@ def _sync_with_controller(self): time.sleep(constants.CONTROLLER_SYNC_INTERVAL) async def _redirect_handler(self, request: fastapi.Request): - self.load_balancing_policy.increment_request_count(1) + self.request_information.add(request) replica_ip = self.load_balancing_policy.select_replica(request) if replica_ip is None: diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index e187b1874a9..bfb01661aed 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,46 +1,17 @@ """LoadBalancingPolicy: Policy to select endpoint.""" -import collections import logging -import time -from typing import Deque, List, Optional, Set +from typing import List, Optional, Set import fastapi logger = logging.getLogger(__name__) -_DEFAULT_QUERY_INTERVAL = 60 - class LoadBalancingPolicy: """Abstract class for load balancing policies.""" def __init__(self) -> None: self.ready_replicas: Set[str] 
= set() - self.request_count: int = 0 - self.request_timestamps: Deque[float] = collections.deque() - self.query_interval: Optional[float] = None - - def increment_request_count(self, count: int = 1) -> None: - self.request_count += count - self.request_timestamps.append(time.time()) - - def set_query_interval(self, query_interval: Optional[float]) -> None: - if query_interval is not None: - self.query_interval = query_interval - else: - self.query_interval = _DEFAULT_QUERY_INTERVAL - - def deprecate_old_requests(self) -> int: - if self.query_interval is None: - logger.error('Query interval is not set. ' - 'Use default interval instead.') - self.set_query_interval(None) - assert self.query_interval is not None - # TODO(tian): Optimize by binary search. - while (self.request_timestamps and - time.time() - self.request_timestamps[0] > self.query_interval): - self.request_timestamps.popleft() - return len(self.request_timestamps) def set_ready_replicas(self, ready_replicas: Set[str]) -> None: raise NotImplementedError diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index fc9e73d2ca6..c571d49b910 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -27,6 +27,8 @@ from sky.utils import subprocess_utils if typing.TYPE_CHECKING: + import fastapi + import sky _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}' @@ -84,6 +86,48 @@ def values(self): return self._dict.values() +class RequestInformation: + """Base class for request information.""" + + def add(self, request: 'fastapi.Request') -> None: + """Add a request to the request information.""" + raise NotImplementedError + + def get(self) -> List[Any]: + """Get all current request information.""" + raise NotImplementedError + + def clear(self) -> None: + """Clear all current request information.""" + raise NotImplementedError + + def __repr__(self) -> str: + raise NotImplementedError + + +class RequestTimestamp(RequestInformation): + """RequestTimestamp: Request information that 
stores request timestamps.""" + + def __init__(self) -> None: + self.timestamps: List[float] = [] + + def add(self, request: 'fastapi.Request') -> None: + """Add a request to the request information.""" + del request # unused + self.timestamps.append(time.time()) + + def get(self) -> List[float]: + """Get all current request information.""" + return self.timestamps + + def clear(self) -> None: + """Clear all current request information.""" + self.timestamps = [] + + def __repr__(self) -> str: + return f'RequestTimestamp(timestamps={self.timestamps})' + + def kill_children_and_self_processes() -> None: subprocess_utils.kill_children_processes() os.kill(os.getpid(), signal.SIGKILL) From 1fd1db5faca1211ea6ff6c1c60a80501cc30439c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 13:14:09 -0700 Subject: [PATCH 119/223] architecture: autoscaler no longer talk with infra provider again --- sky/serve/autoscalers.py | 142 ++++++++++++++++++----------------- sky/serve/constants.py | 6 +- sky/serve/controller.py | 43 ++++++++++- sky/serve/infra_providers.py | 16 ---- 4 files changed, 117 insertions(+), 90 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index d6e340758ba..c87294fa79a 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -1,14 +1,19 @@ """Autoscalers: perform autoscaling by monitoring metrics.""" import bisect +import dataclasses +import enum import logging -import threading import time +import typing from typing import List, Optional from sky.serve import constants -from sky.serve import infra_providers +from sky.serve import serve_state from sky.serve import serve_utils +if typing.TYPE_CHECKING: + from sky.serve import infra_providers + logger = logging.getLogger(__name__) # Since sky.launch is very resource demanding, we limit the number of @@ -17,22 +22,32 @@ _MAX_BOOTSTRAPPING_NUM = 5 +class AutoscalerDecisionOperator(enum.Enum): + SCALE_UP = 'scale_up' + SCALE_DOWN = 'scale_down' + NO_OP = 'no_op' 
+ + +@dataclasses.dataclass +class AutoscalerDecision: + operator: AutoscalerDecisionOperator + num_replicas: Optional[int] + + class Autoscaler: """Abstract class for autoscalers.""" def __init__(self, - infra_provider: infra_providers.InfraProvider, auto_restart: bool, frequency: int, min_nodes: int = 1, max_nodes: Optional[int] = None) -> None: - self.infra_provider = infra_provider self.auto_restart = auto_restart self.min_nodes: int = min_nodes # Default to fixed node, i.e. min_nodes == max_nodes. self.max_nodes: int = max_nodes or min_nodes self.frequency = frequency # Time to sleep in seconds. - if frequency < constants.CONTROLLER_SYNC_INTERVAL: + if self.frequency < constants.CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' 'controller sync interval. It might ' 'not always got the latest information.') @@ -41,40 +56,11 @@ def update_request_information( self, request_information: serve_utils.RequestInformation) -> None: raise NotImplementedError - def evaluate_scaling(self) -> None: + def evaluate_scaling( + self, + infos: List['infra_providers.ReplicaInfo']) -> AutoscalerDecision: raise NotImplementedError - def scale_up(self, num_nodes_to_add: int) -> None: - logger.debug(f'Scaling up by {num_nodes_to_add} nodes') - self.infra_provider.scale_up(num_nodes_to_add) - - def scale_down(self, num_nodes_to_remove: int) -> None: - logger.debug(f'Scaling down by {num_nodes_to_remove} nodes') - self.infra_provider.scale_down(num_nodes_to_remove) - - def run(self) -> None: - logger.info('Starting autoscaler monitor.') - while True: - try: - self.evaluate_scaling() - except Exception as e: # pylint: disable=broad-except - # No matter what error happens, we should keep the - # monitor running. 
- logger.error(f'Error in autoscaler: {e}') - for _ in range(self.frequency): - if self.run_thread_stop_event.is_set(): - return - time.sleep(1) - - def start(self) -> None: - self.run_thread_stop_event = threading.Event() - self.run_thread = threading.Thread(target=self.run) - self.run_thread.start() - - def terminate(self) -> None: - self.run_thread_stop_event.set() - self.run_thread.join() - class RequestRateAutoscaler(Autoscaler): """RequestRateAutoscaler: Autoscale according to request rate. @@ -85,14 +71,12 @@ class RequestRateAutoscaler(Autoscaler): def __init__(self, *args, upper_threshold: Optional[float], lower_threshold: Optional[float], cooldown: int, - query_interval: int, **kwargs) -> None: + rps_window_size: int, **kwargs) -> None: super().__init__(*args, **kwargs) # Cooldown between two scaling operations in seconds. self.cooldown: int = cooldown - # Query interval for requests num. Every `query_interval` seconds, - # Autoscaler will received an update for number of requests from - # load balancer. - self.query_interval: int = query_interval + # Window size for rps calculating. + self.rps_window_size: int = rps_window_size # Time of last scale operation self.last_scale_operation: float = 0. 
# All request timestamps @@ -111,13 +95,19 @@ def update_request_information( self.request_timestamps.extend(request_information.get()) current_time = time.time() index = bisect.bisect_left(self.request_timestamps, - current_time - self.query_interval) + current_time - self.rps_window_size) self.request_timestamps = self.request_timestamps[index:] - def evaluate_scaling(self) -> None: + def evaluate_scaling( + self, + infos: List['infra_providers.ReplicaInfo']) -> AutoscalerDecision: current_time = time.time() - num_nodes = self.infra_provider.total_replica_num( - count_failed_replica=not self.auto_restart) + if not self.auto_restart: + num_nodes = len(infos) + else: + num_nodes = len([ + i for i in infos if i.status != serve_state.ReplicaStatus.FAILED + ]) # Check if cooldown period has passed since the last scaling operation. # Only cooldown if bootstrapping is done. @@ -129,11 +119,12 @@ def evaluate_scaling(self) -> None: f'cooldown: {self.cooldown}') logger.info('Cooldown period has not passed since last scaling ' 'operation. Skipping scaling.') - return + return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, + num_replicas=None) # Convert to requests per second. - num_requests_per_second = float(len( - self.request_timestamps)) / self.query_interval + num_requests_per_second = len( + self.request_timestamps) / self.rps_window_size # Edge case: num_nodes is zero. 
requests_per_node = (num_requests_per_second / num_nodes if num_nodes else num_requests_per_second) @@ -144,24 +135,41 @@ def evaluate_scaling(self) -> None: logger.info(f'Number of nodes: {num_nodes}') if num_nodes < self.min_nodes: logger.info('Bootstrapping service.') - self.scale_up( - min(self.min_nodes - num_nodes, _MAX_BOOTSTRAPPING_NUM)) self.last_scale_operation = current_time - elif (self.upper_threshold is not None and - requests_per_node > self.upper_threshold): + return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, + num_replicas=min( + self.min_nodes - num_nodes, + _MAX_BOOTSTRAPPING_NUM)) + if (self.upper_threshold is not None and + requests_per_node > self.upper_threshold): if num_nodes < self.max_nodes: - logger.info('Requests per node is above upper threshold ' - f'{self.upper_threshold}qps/node. ' - 'Scaling up by 1 node.') - self.scale_up(1) - self.last_scale_operation = current_time - elif (self.lower_threshold is not None and - requests_per_node < self.lower_threshold): + scale_target = requests_per_node / self.upper_threshold + num_nodes_to_add = int(scale_target * num_nodes) - num_nodes + if num_nodes_to_add > 0: + plural = 's' if num_nodes_to_add > 1 else '' + logger.info( + 'Requests per node is above upper threshold ' + f'{self.upper_threshold}qps/node. ' + f'Scaling up by {num_nodes_to_add} node{plural}.') + self.last_scale_operation = current_time + return AutoscalerDecision( + AutoscalerDecisionOperator.SCALE_UP, + num_replicas=num_nodes_to_add) + if (self.lower_threshold is not None and + requests_per_node < self.lower_threshold): if num_nodes > self.min_nodes: - logger.info('Requests per node is below lower threshold ' - f'{self.lower_threshold}qps/node. 
' - 'Scaling down by 1 node.') - self.scale_down(1) - self.last_scale_operation = current_time - else: - logger.info('No scaling needed.') + scale_target = requests_per_node / self.lower_threshold + num_nodes_to_remove = num_nodes - int(scale_target * num_nodes) + if num_nodes_to_remove > 0: + plural = 's' if num_nodes_to_remove > 1 else '' + logger.info( + 'Requests per node is below lower threshold ' + f'{self.lower_threshold}qps/node. ' + f'Scaling down by {num_nodes_to_remove} node{plural}.') + self.last_scale_operation = current_time + return AutoscalerDecision( + AutoscalerDecisionOperator.SCALE_DOWN, + num_replicas=num_nodes_to_remove) + logger.info('No scaling needed.') + return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, + num_replicas=None) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 6e5e66731fa..a830f0224cf 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -36,9 +36,9 @@ # Wait for 1 minutes for controller / load balancer to terminate. SERVE_TERMINATE_WAIT_TIMEOUT = 60 -# Autoscaler query interval (window size) for request per second. We calculate -# rps by divide the number of requests in last query interval by this interval. -AUTOSCALER_QUERY_INTERVAL = 60 +# Autoscaler window size for request per second. We calculate rps by divide the +# number of requests in last window size by this window size. +AUTOSCALER_RPS_WINDOW_SIZE = 60 # Autoscaler scale frequency. We will try to scale up/down every # `scale_frequency`. 
AUTOSCALER_SCALE_FREQUENCY = 20 diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 5ad096fe69e..ca27455057a 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -65,6 +65,41 @@ def _check_terminate(self): serve_utils.kill_children_and_self_processes() time.sleep(10) + def _run_autoscaler(self): + logger.info('Starting autoscaler monitor.') + while True: + try: + replica_info = self.infra_provider.get_replica_info( + verbose=env_options.Options.SHOW_DEBUG_INFO.get()) + logger.info(f'All replica info: {replica_info}') + scaling_option = self.autoscaler.evaluate_scaling(replica_info) + if (scaling_option.operator == + autoscalers.AutoscalerDecisionOperator.SCALE_UP): + assert scaling_option.num_replicas is not None + self.infra_provider.scale_up(scaling_option.num_replicas) + elif (scaling_option.operator == + autoscalers.AutoscalerDecisionOperator.SCALE_DOWN): + assert scaling_option.num_replicas is not None + self.infra_provider.scale_down(scaling_option.num_replicas) + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # monitor running. 
+ logger.error(f'Error in autoscaler: {e}') + for _ in range(self.autoscaler.frequency): + if self.autoscaler_stop_event.is_set(): + logger.info('Autoscaler monitor terminated.') + return + time.sleep(1) + + def _start_autoscaler(self): + self.autoscaler_stop_event = threading.Event() + self.autoscaler_thread = threading.Thread(target=self._run_autoscaler) + self.autoscaler_thread.start() + + def _terminate_autoscaler(self): + self.autoscaler_stop_event.set() + self.autoscaler_thread.join() + def run(self) -> None: @self.app.post('/controller/report_request_information') @@ -72,6 +107,7 @@ def report_request_information(request: fastapi.Request): request_data = asyncio.run(request.json()) request_information_payload = request_data.get( 'request_information') + logger.info(request_information_payload) request_information = pickle.loads( base64.b64decode(request_information_payload)) logger.info( @@ -124,7 +160,7 @@ def terminate(request: fastapi.Request): self.infra_provider.service_name, serve_state.ServiceStatus.SHUTTING_DOWN) logger.info('Terminate autoscaler...') - self.autoscaler.terminate() + self._terminate_autoscaler() msg = self.infra_provider.terminate() if msg is None: # We cannot terminate the controller now because we still @@ -132,7 +168,7 @@ def terminate(request: fastapi.Request): self.terminating = True return {'message': msg} - self.autoscaler.start() + self._start_autoscaler() # Start a daemon to check if the controller is terminating, and if so, # shutdown the controller so the skypilot jobs will finish, thus enable @@ -183,7 +219,6 @@ def terminate(request: fastapi.Request): # ======= Autoscaler ========= _autoscaler = autoscalers.RequestRateAutoscaler( - _infra_provider, auto_restart=service_spec.auto_restart, frequency=constants.AUTOSCALER_SCALE_FREQUENCY, min_nodes=service_spec.min_replicas, @@ -191,7 +226,7 @@ def terminate(request: fastapi.Request): upper_threshold=service_spec.qps_upper_threshold, 
lower_threshold=service_spec.qps_lower_threshold, cooldown=60, - query_interval=constants.AUTOSCALER_QUERY_INTERVAL) + rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE) # ======= SkyServeController ========= controller = SkyServeController(args.controller_port, _infra_provider, diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index db00e1a8d4d..8521cbc7129 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -22,7 +22,6 @@ from sky.serve import serve_utils from sky.skylet import constants from sky.skylet import job_lib -from sky.utils import env_options logger = logging.getLogger(__name__) @@ -262,10 +261,6 @@ def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: # Get replica info for all replicas raise NotImplementedError - def total_replica_num(self, count_failed_replica: bool) -> int: - # Returns the total number of replicas - raise NotImplementedError - def get_ready_replicas(self) -> Set[str]: # Returns the endpoints of all ready replicas raise NotImplementedError @@ -455,13 +450,6 @@ def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: for info in serve_state.get_replica_infos(self.service_name) ] - def total_replica_num(self, count_failed_replica: bool) -> int: - infos = serve_state.get_replica_infos(self.service_name) - if count_failed_replica: - return len(infos) - return len( - [i for i in infos if i.status != serve_state.ReplicaStatus.FAILED]) - def get_ready_replicas(self) -> Set[str]: ready_replicas = set() infos = serve_state.get_replica_infos(self.service_name) @@ -655,10 +643,6 @@ def _start_replica_prober(self) -> None: @with_lock def _probe_all_replicas(self) -> None: - replica_info = self.get_replica_info( - verbose=env_options.Options.SHOW_DEBUG_INFO.get()) - logger.info(f'All replica info: {replica_info}') - probe_futures = [] replica_to_probe = [] with futures.ThreadPoolExecutor() as executor: From 42e1f4ad67993c3e39671a7617cf5d4c051fa803 Mon Sep 17 00:00:00 
2001 From: cblmemo Date: Fri, 13 Oct 2023 14:02:12 -0700 Subject: [PATCH 120/223] fix auto restart --- sky/serve/autoscalers.py | 20 ++++++++------------ sky/serve/controller.py | 1 - sky/serve/infra_providers.py | 8 ++++++-- sky/serve/serve_state.py | 6 +++--- sky/serve/serve_utils.py | 8 +++++--- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index c87294fa79a..df2ed9995bc 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -4,16 +4,12 @@ import enum import logging import time -import typing -from typing import List, Optional +from typing import Any, Dict, List, Optional from sky.serve import constants from sky.serve import serve_state from sky.serve import serve_utils -if typing.TYPE_CHECKING: - from sky.serve import infra_providers - logger = logging.getLogger(__name__) # Since sky.launch is very resource demanding, we limit the number of @@ -56,9 +52,9 @@ def update_request_information( self, request_information: serve_utils.RequestInformation) -> None: raise NotImplementedError - def evaluate_scaling( - self, - infos: List['infra_providers.ReplicaInfo']) -> AutoscalerDecision: + def evaluate_scaling(self, infos: List[Dict[str, + Any]]) -> AutoscalerDecision: + """Evaluate autoscale options based on replica information.""" raise NotImplementedError @@ -98,15 +94,15 @@ def update_request_information( current_time - self.rps_window_size) self.request_timestamps = self.request_timestamps[index:] - def evaluate_scaling( - self, - infos: List['infra_providers.ReplicaInfo']) -> AutoscalerDecision: + def evaluate_scaling(self, infos: List[Dict[str, + Any]]) -> AutoscalerDecision: current_time = time.time() if not self.auto_restart: num_nodes = len(infos) else: num_nodes = len([ - i for i in infos if i.status != serve_state.ReplicaStatus.FAILED + i for i in infos + if i['status'] != serve_state.ReplicaStatus.FAILED ]) # Check if cooldown period has passed since the last scaling 
operation. diff --git a/sky/serve/controller.py b/sky/serve/controller.py index ca27455057a..b92ec663de1 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -107,7 +107,6 @@ def report_request_information(request: fastapi.Request): request_data = asyncio.run(request.json()) request_information_payload = request_data.get( 'request_information') - logger.info(request_information_payload) request_information = pickle.loads( base64.b64decode(request_information_payload)) logger.info( diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 8521cbc7129..e91a79e06ad 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -623,8 +623,12 @@ def _replica_prober(self) -> None: logger.info('Running replica prober.') try: self._probe_all_replicas() - serve_utils.set_service_status_from_replica_info( - self.service_name, self.get_replica_info(verbose=True)) + replica_statuses = [ + info['status'] + for info in self.get_replica_info(verbose=False) + ] + serve_utils.set_service_status_from_replica_statuses( + self.service_name, replica_statuses) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # replica prober running. diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index f268d3153b3..2d0feda8c89 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -126,9 +126,9 @@ def colored_str(self): return f'{color}{self.value}{colorama.Style.RESET_ALL}' @classmethod - def from_replica_info( - cls, replica_info: List[Dict[str, Any]]) -> 'ServiceStatus': - status2num = collections.Counter([i['status'] for i in replica_info]) + def from_replica_statuses( + cls, replica_statuses: List[ReplicaStatus]) -> 'ServiceStatus': + status2num = collections.Counter(replica_statuses) # If one replica is READY, the service is READY. 
if status2num[ReplicaStatus.READY] > 0: return cls.READY diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index c571d49b910..048bed5e979 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -320,8 +320,9 @@ def get_available_controller_name() -> Tuple[str, bool]: key=lambda k: controller2slots[k]), False -def set_service_status_from_replica_info( - service_name: str, replica_info: List[Dict[str, Any]]) -> None: +def set_service_status_from_replica_statuses( + service_name: str, + replica_statuses: List[serve_state.ReplicaStatus]) -> None: record = serve_state.get_service_from_name(service_name) if record is None: raise ValueError(f'Service {service_name!r} does not exist. ' @@ -333,7 +334,8 @@ def set_service_status_from_replica_info( # change service status to READY. return serve_state.set_service_status( - service_name, serve_state.ServiceStatus.from_replica_info(replica_info)) + service_name, + serve_state.ServiceStatus.from_replica_statuses(replica_statuses)) def update_service_status() -> None: From 3f81cdba89a3712f6e9c9a999297e52682457aba Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 14:29:52 -0700 Subject: [PATCH 121/223] sync down logs and then streaming --- sky/backends/backend_utils.py | 4 +- sky/serve/infra_providers.py | 71 +++++++++++++++++------------------ sky/serve/serve_utils.py | 8 +++- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 3349e08ff55..8e8679c505f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2842,12 +2842,13 @@ def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: def download_and_stream_latest_job_log( backend: 'cloud_vm_ray_backend.CloudVmRayBackend', handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle', local_dir: str, - log_position_hint: str, log_finish_hint: str) -> None: + log_position_hint: str, log_finish_hint: str) -> Optional[str]: 
"""Downloads and streams the latest job log. This function is only used by spot controller and sky serve controller. """ os.makedirs(local_dir, exist_ok=True) + log_file = None try: log_dirs = backend.sync_down_logs( handle, @@ -2879,6 +2880,7 @@ def download_and_stream_latest_job_log( f'program at {log_file}.') else: logger.info(f'\n== End of logs ({log_finish_hint}) ==') + return log_file @typing.overload diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index e91a79e06ad..42f3465e628 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -403,16 +403,7 @@ def _fetch_job_status(self) -> None: info.replica_id, info) logger.warning( f'User APP for replica {info.replica_id} FAILED. ' - 'Start streaming logs...') - replica_job_logs_dir = os.path.join( - constants.SKY_LOGS_DIRECTORY, 'replica_jobs') - backend_utils.download_and_stream_latest_job_log( - backend, - handle, - replica_job_logs_dir, - log_position_hint='replica cluster', - log_finish_hint=f'Replica: {info.replica_id}') - logger.info('Terminating...') + 'Terminating...') self._teardown_replica(info.replica_id, sync_down_logs=True) def _job_status_fetcher(self) -> None: @@ -489,36 +480,44 @@ def scale_up(self, n: int) -> None: self._launch_replica(self.next_replica_id) self.next_replica_id += 1 - def _teardown_replica(self, - replica_id: int, - sync_down_logs: bool = True) -> None: + def _teardown_replica(self, replica_id: int, sync_down_logs: bool) -> None: if replica_id in self.down_process_pool: logger.warning(f'Down process for replica {replica_id} already ' 'exists. Skipping.') return - if sync_down_logs: + def _sync_down_logs(): + info = serve_state.get_replica_info_from_id(self.service_name, + replica_id) + if info is None: + logger.error(f'Cannot find replica {replica_id} in the ' + 'replica table. 
Skipping syncing down logs.') + return logger.info(f'Syncing down logs for replica {replica_id}...') - # TODO(tian): Maybe use - # backend_utils.download_and_stream_latest_job_log here - code = serve_utils.ServeCodeGen.stream_replica_logs( - self.service_name, - replica_id, - follow=False, - skip_local_log_file_check=True) - local_log_file_name = ( - serve_utils.generate_replica_local_log_file_name( - self.service_name, replica_id)) - with open(local_log_file_name, 'w') as f: - try: - subprocess.run(code, shell=True, check=True, stdout=f) - except Exception as e: # pylint: disable=broad-except - # No matter what error happens, we should teardown the - # cluster. - msg = ('Error in syncing down logs for replica ' - f'{replica_id}: {e}') - logger.error(msg) - print(msg, file=f) + backend = backends.CloudVmRayBackend() + handle = global_user_state.get_handle_from_cluster_name( + info.cluster_name) + if handle is None: + logger.error(f'Cannot find cluster {info.cluster_name} ' + 'in the cluster table. Skipping syncing ' + 'down logs.') + return + replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, + 'replica_jobs') + log_file = backend_utils.download_and_stream_latest_job_log( + backend, + handle, + replica_job_logs_dir, + log_position_hint='replica cluster', + log_finish_hint=f'Replica: {replica_id}') + if log_file is not None: + local_log_file_name = ( + serve_utils.generate_replica_local_log_file_name( + self.service_name, replica_id)) + os.rename(log_file, local_log_file_name) + + if sync_down_logs: + _sync_down_logs() logger.info(f'Deleting replica {replica_id}') info = serve_state.get_replica_info_from_id(self.service_name, @@ -546,7 +545,7 @@ def scale_down(self, n: int) -> None: f'only {len(infos)} replicas. 
Scale down all ' 'replicas instead.') for i in range(min(n, len(infos))): - self._teardown_replica(infos[i].replica_id) + self._teardown_replica(infos[i].replica_id, sync_down_logs=False) # TODO(tian): Maybe just kill all threads and cleanup using db record def terminate(self) -> Optional[str]: @@ -717,4 +716,4 @@ def _probe_all_replicas(self) -> None: serve_state.add_or_update_replica(self.service_name, info.replica_id, info) if should_teardown: - self._teardown_replica(info.replica_id) + self._teardown_replica(info.replica_id, sync_down_logs=True) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 048bed5e979..82d29b99a86 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -50,6 +50,10 @@ class ServiceComponent(enum.Enum): ValueType = TypeVar('ValueType') +# Google style guide: Do not rely on the atomicity of built-in types. +# Our launch and down process pool will be used by multiple threads, +# therefore we need to use a thread-safe dict. +# see https://google.github.io/styleguide/pyguide.html#218-threading class ThreadSafeDict(Generic[KeyType, ValueType]): """A thread-safe dict.""" @@ -616,8 +620,8 @@ def _get_replica_status() -> serve_state.ReplicaStatus: f'of replica {replica_id}...{colorama.Style.RESET_ALL}') backend = backends.CloudVmRayBackend() - # Always tail the logs of the first job, which represent user setup & run. - returncode = backend.tail_logs(handle, job_id=1, follow=follow) + # Always tail the latest logs, which represent user setup & run. 
+ returncode = backend.tail_logs(handle, job_id=None, follow=follow) if returncode != 0: return (f'{colorama.Fore.RED}Failed to stream logs for replica ' f'{replica_id}.{colorama.Style.RESET_ALL}') From 692d8444c5727ad51a4a886b524c94a56cf97dbb Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 14:40:20 -0700 Subject: [PATCH 122/223] terminate log streaming when service is downed --- sky/serve/serve_state.py | 10 +++++++--- sky/serve/serve_utils.py | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 2d0feda8c89..ce467e793b4 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -76,10 +76,10 @@ class ReplicaStatus(enum.Enum): UNKNOWN = 'UNKNOWN' @classmethod - def failed_statuses(cls): + def failed_statuses(cls) -> List['ReplicaStatus']: return [cls.FAILED, cls.FAILED_CLEANUP, cls.UNKNOWN] - def colored_str(self): + def colored_str(self) -> str: color = _REPLICA_STATUS_TO_COLOR[self] return f'{color}{self.value}{colorama.Style.RESET_ALL}' @@ -121,7 +121,11 @@ class ServiceStatus(enum.Enum): # At least one replica is failed and no replica is ready FAILED = 'FAILED' - def colored_str(self): + @classmethod + def failed_statuses(cls) -> List['ServiceStatus']: + return [cls.CONTROLLER_FAILED, cls.UNKNOWN, cls.FAILED] + + def colored_str(self) -> str: color = _SERVICE_STATUS_TO_COLOR[self] return f'{color}{self.value}{colorama.Style.RESET_ALL}' diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 82d29b99a86..ab77122e065 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -628,7 +628,8 @@ def _get_replica_status() -> serve_state.ReplicaStatus: return '' -def _follow_logs(file: TextIO, exit_if_stream_end: bool) -> Iterator[str]: +def _follow_logs(file: TextIO, *, finish_stream: Callable[[], bool], + exit_if_stream_end: bool) -> Iterator[str]: line = '' while True: tmp = file.readline() @@ -638,7 +639,7 @@ def 
_follow_logs(file: TextIO, exit_if_stream_end: bool) -> Iterator[str]: yield line line = '' else: - if exit_if_stream_end: + if exit_if_stream_end or finish_stream(): break time.sleep(1) @@ -652,8 +653,17 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool, log_file = generate_remote_controller_log_file_name(service_name) else: log_file = generate_remote_load_balancer_log_file_name(service_name) + + def _service_is_terminal() -> bool: + record = serve_state.get_service_from_name(service_name) + if record is None: + return True + return record['status'] in serve_state.ServiceStatus.failed_statuses() + with open(os.path.expanduser(log_file), 'r', newline='') as f: - for line in _follow_logs(f, exit_if_stream_end=not follow): + for line in _follow_logs(f, + finish_stream=_service_is_terminal, + exit_if_stream_end=not follow): print(line, end='', flush=True) return '' From 8d4f64d2f953edc948640afa713e724e1f3fc893 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 15:21:30 -0700 Subject: [PATCH 123/223] refactor autoscaler & argument pass in controller --- sky/serve/autoscalers.py | 150 +++++++++++++++++++++-------------- sky/serve/constants.py | 3 + sky/serve/controller.py | 24 +++--- sky/serve/infra_providers.py | 40 ++++------ sky/serve/load_balancer.py | 7 ++ 5 files changed, 125 insertions(+), 99 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index df2ed9995bc..2857dfdaa91 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -4,12 +4,16 @@ import enum import logging import time -from typing import Any, Dict, List, Optional +import typing +from typing import Any, Dict, List, Optional, Union from sky.serve import constants from sky.serve import serve_state from sky.serve import serve_utils +if typing.TYPE_CHECKING: + from sky.serve import service_spec + logger = logging.getLogger(__name__) # Since sky.launch is very resource demanding, we limit the number of @@ -27,22 +31,30 @@ class 
AutoscalerDecisionOperator(enum.Enum): @dataclasses.dataclass class AutoscalerDecision: operator: AutoscalerDecisionOperator - num_replicas: Optional[int] + target: Optional[Union[int, List[int]]] + + def __repr__(self) -> str: + return f'AutoscalerDecision({self.operator}, {self.target})' class Autoscaler: """Abstract class for autoscalers.""" - def __init__(self, - auto_restart: bool, - frequency: int, - min_nodes: int = 1, - max_nodes: Optional[int] = None) -> None: - self.auto_restart = auto_restart - self.min_nodes: int = min_nodes - # Default to fixed node, i.e. min_nodes == max_nodes. - self.max_nodes: int = max_nodes or min_nodes - self.frequency = frequency # Time to sleep in seconds. + def __init__(self, spec: 'service_spec.SkyServiceSpec', + frequency: int) -> None: + """Initialize the autoscaler. + + Variables: + auto_restart: Whether to restart failed replicas. + min_replicas: Minimum number of replicas. + max_replicas: Maximum number of replicas. Default to fixed + number of replicas, i.e. min_replicas == max_replicas. + frequency: Frequency of autoscaling in seconds. + """ + self.auto_restart = spec.auto_restart + self.min_replicas: int = spec.min_replicas + self.max_replicas: int = spec.max_replicas or spec.min_replicas + self.frequency = frequency if self.frequency < constants.CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' 'controller sync interval. It might ' @@ -50,6 +62,7 @@ def __init__(self, def update_request_information( self, request_information: serve_utils.RequestInformation) -> None: + """Update request information for autoscaling.""" raise NotImplementedError def evaluate_scaling(self, infos: List[Dict[str, @@ -65,22 +78,26 @@ class RequestRateAutoscaler(Autoscaler): the threshold. 
""" - def __init__(self, *args, upper_threshold: Optional[float], - lower_threshold: Optional[float], cooldown: int, - rps_window_size: int, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Cooldown between two scaling operations in seconds. + def __init__(self, spec: 'service_spec.SkyServiceSpec', frequency: int, + cooldown: int, rps_window_size: int) -> None: + """Initialize the request rate autoscaler. + + Variables: + upper_threshold: Upper threshold for scale up. If None, no scale up. + lower_threshold: Lower threshold for scale down. If None, no scale + down. + cooldown: Cooldown between two scaling operations in seconds. + rps_window_size: Window size for rps calculating. + last_scale_operation: Time of last scale operation. + request_timestamps: All request timestamps within the window. + """ + super().__init__(spec, frequency) + self.upper_threshold: Optional[float] = spec.qps_upper_threshold + self.lower_threshold: Optional[float] = spec.qps_lower_threshold self.cooldown: int = cooldown - # Window size for rps calculating. self.rps_window_size: int = rps_window_size - # Time of last scale operation self.last_scale_operation: float = 0. - # All request timestamps self.request_timestamps: List[float] = [] - # Upper threshold for scale up. If None, no scale up. - self.upper_threshold: Optional[float] = upper_threshold - # Lower threshold for scale down. If None, no scale down. - self.lower_threshold: Optional[float] = lower_threshold def update_request_information( self, request_information: serve_utils.RequestInformation) -> None: @@ -98,16 +115,16 @@ def evaluate_scaling(self, infos: List[Dict[str, Any]]) -> AutoscalerDecision: current_time = time.time() if not self.auto_restart: - num_nodes = len(infos) + num_replicas = len(infos) else: - num_nodes = len([ + num_replicas = len([ i for i in infos if i['status'] != serve_state.ReplicaStatus.FAILED ]) # Check if cooldown period has passed since the last scaling operation. 
# Only cooldown if bootstrapping is done. - if num_nodes >= self.min_nodes: + if num_replicas >= self.min_replicas: if current_time - self.last_scale_operation < self.cooldown: logger.info( f'Current time: {current_time}, ' @@ -116,56 +133,69 @@ def evaluate_scaling(self, infos: List[Dict[str, logger.info('Cooldown period has not passed since last scaling ' 'operation. Skipping scaling.') return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, - num_replicas=None) + target=None) # Convert to requests per second. num_requests_per_second = len( self.request_timestamps) / self.rps_window_size - # Edge case: num_nodes is zero. - requests_per_node = (num_requests_per_second / num_nodes - if num_nodes else num_requests_per_second) + # Edge case: num_replicas is zero. + requests_per_replica = (num_requests_per_second / num_replicas + if num_replicas else num_requests_per_second) - logger.info(f'Requests per node: {requests_per_node}') + logger.info(f'Requests per replica: {requests_per_replica}') # Bootstrap case - logger.info(f'Number of nodes: {num_nodes}') - if num_nodes < self.min_nodes: + logger.info(f'Number of replicas: {num_replicas}') + if num_replicas < self.min_replicas: logger.info('Bootstrapping service.') self.last_scale_operation = current_time return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, - num_replicas=min( - self.min_nodes - num_nodes, + target=min( + self.min_replicas - num_replicas, _MAX_BOOTSTRAPPING_NUM)) if (self.upper_threshold is not None and - requests_per_node > self.upper_threshold): - if num_nodes < self.max_nodes: - scale_target = requests_per_node / self.upper_threshold - num_nodes_to_add = int(scale_target * num_nodes) - num_nodes - if num_nodes_to_add > 0: - plural = 's' if num_nodes_to_add > 1 else '' - logger.info( - 'Requests per node is above upper threshold ' - f'{self.upper_threshold}qps/node. 
' - f'Scaling up by {num_nodes_to_add} node{plural}.') + requests_per_replica > self.upper_threshold): + if num_replicas < self.max_replicas: + scale_target = requests_per_replica / self.upper_threshold + num_replicas_to_add = max(int(scale_target * num_replicas), + self.max_replicas) - num_replicas + if num_replicas_to_add > 0: + plural = 's' if num_replicas_to_add > 1 else '' + logger.info('Requests per replica is above upper threshold ' + f'{self.upper_threshold}qps / replica. ' + f'Scaling up by {num_replicas_to_add} ' + f'replica{plural}.') self.last_scale_operation = current_time return AutoscalerDecision( AutoscalerDecisionOperator.SCALE_UP, - num_replicas=num_nodes_to_add) + target=num_replicas_to_add) if (self.lower_threshold is not None and - requests_per_node < self.lower_threshold): - if num_nodes > self.min_nodes: - scale_target = requests_per_node / self.lower_threshold - num_nodes_to_remove = num_nodes - int(scale_target * num_nodes) - if num_nodes_to_remove > 0: - plural = 's' if num_nodes_to_remove > 1 else '' - logger.info( - 'Requests per node is below lower threshold ' - f'{self.lower_threshold}qps/node. ' - f'Scaling down by {num_nodes_to_remove} node{plural}.') + requests_per_replica < self.lower_threshold): + if num_replicas > self.min_replicas: + scale_target = requests_per_replica / self.lower_threshold + num_replicas_to_remove = num_replicas - min( + int(scale_target * num_replicas), self.min_replicas) + if num_replicas_to_remove > 0: + plural = 's' if num_replicas_to_remove > 1 else '' + logger.info('Requests per replica is below lower threshold ' + f'{self.lower_threshold}qps / replica. ' + f'Scaling down by {num_replicas_to_remove} ' + f'replica{plural}.') self.last_scale_operation = current_time + # Remove FAILED replicas first. 
+ replica_ids_to_remove: List[int] = [] + for i in infos: + if len(replica_ids_to_remove) >= num_replicas_to_remove: + break + if i['status'] == serve_state.ReplicaStatus.FAILED: + replica_ids_to_remove.append(i['replica_id']) + # Then rest of them. + for i in infos: + if len(replica_ids_to_remove) >= num_replicas_to_remove: + break + replica_ids_to_remove.append(i['replica_id']) return AutoscalerDecision( AutoscalerDecisionOperator.SCALE_DOWN, - num_replicas=num_nodes_to_remove) + target=replica_ids_to_remove) logger.info('No scaling needed.') - return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, - num_replicas=None) + return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, target=None) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index a830f0224cf..a1dd1f34d95 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -42,6 +42,9 @@ # Autoscaler scale frequency. We will try to scale up/down every # `scale_frequency`. AUTOSCALER_SCALE_FREQUENCY = 20 +# Autoscaler cooldown time. We will not scale up/down if the last scale up/down +# is within this cooldown time. +AUTOSCALER_COOLDOWN_SECONDS = 60 # The default controller resources. 
# We need 200 GB disk space to enable using Azure as controller, since its image diff --git a/sky/serve/controller.py b/sky/serve/controller.py index b92ec663de1..c3204512caa 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -75,12 +75,14 @@ def _run_autoscaler(self): scaling_option = self.autoscaler.evaluate_scaling(replica_info) if (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_UP): - assert scaling_option.num_replicas is not None - self.infra_provider.scale_up(scaling_option.num_replicas) + assert isinstance(scaling_option.target, + int), scaling_option + self.infra_provider.scale_up(scaling_option.target) elif (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_DOWN): - assert scaling_option.num_replicas is not None - self.infra_provider.scale_down(scaling_option.num_replicas) + assert isinstance(scaling_option.target, + list), scaling_option + self.infra_provider.scale_down(scaling_option.target) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. 
@@ -210,21 +212,13 @@ def terminate(request: fastapi.Request): # ======= Infra Provider ========= service_spec = serve.SkyServiceSpec.from_yaml(args.task_yaml) _infra_provider = infra_providers.SkyPilotInfraProvider( - args.task_yaml, - args.service_name, - readiness_suffix=service_spec.readiness_suffix, - initial_delay_seconds=service_spec.initial_delay_seconds, - post_data=service_spec.post_data) + args.service_name, service_spec, task_yaml_path=args.task_yaml) # ======= Autoscaler ========= _autoscaler = autoscalers.RequestRateAutoscaler( - auto_restart=service_spec.auto_restart, + service_spec, frequency=constants.AUTOSCALER_SCALE_FREQUENCY, - min_nodes=service_spec.min_replicas, - max_nodes=service_spec.max_replicas, - upper_threshold=service_spec.qps_upper_threshold, - lower_threshold=service_spec.qps_lower_threshold, - cooldown=60, + cooldown=constants.AUTOSCALER_COOLDOWN_SECONDS, rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE) # ======= SkyServeController ========= diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 42f3465e628..e54a2880f77 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -9,6 +9,7 @@ import subprocess import threading import time +import typing from typing import Any, Dict, List, Optional, Set, Tuple, Union import psutil @@ -23,6 +24,9 @@ from sky.skylet import constants from sky.skylet import job_lib +if typing.TYPE_CHECKING: + from sky.serve import service_spec + logger = logging.getLogger(__name__) _JOB_STATUS_FETCH_INTERVAL = 30 @@ -240,18 +244,14 @@ def probe( class InfraProvider: """Each infra provider manages one service.""" - def __init__( - self, - service_name: str, - readiness_suffix: str, - initial_delay_seconds: int, - post_data: Optional[Union[str, Dict[str, Any]]] = None) -> None: + def __init__(self, service_name: str, + spec: 'service_spec.SkyServiceSpec') -> None: self.lock = threading.Lock() self.next_replica_id: int = 1 self.service_name: str = service_name 
- self.readiness_suffix: str = readiness_suffix - self.initial_delay_seconds: int = initial_delay_seconds - self.post_data: Optional[Union[str, Dict[str, Any]]] = post_data + self.readiness_suffix: str = spec.readiness_suffix + self.initial_delay_seconds: int = spec.initial_delay_seconds + self.post_data: Optional[Union[str, Dict[str, Any]]] = spec.post_data self.uptime: Optional[float] = None logger.info(f'Readiness probe suffix: {self.readiness_suffix}') logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') @@ -268,9 +268,7 @@ def get_ready_replicas(self) -> Set[str]: def scale_up(self, n: int) -> None: raise NotImplementedError - def scale_down(self, n: int) -> None: - # TODO - Scale down must also pass in a list of replicas to - # delete or the number of replicas to delete + def scale_down(self, replica_ids: List[int]) -> None: raise NotImplementedError def terminate(self) -> Optional[str]: @@ -281,8 +279,9 @@ def terminate(self) -> Optional[str]: class SkyPilotInfraProvider(InfraProvider): """Infra provider for SkyPilot clusters.""" - def __init__(self, task_yaml_path: str, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) + def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', + task_yaml_path: str) -> None: + super().__init__(service_name, spec) self.task_yaml_path: str = task_yaml_path self.launch_process_pool: serve_utils.ThreadSafeDict[ int, subprocess.Popen] = serve_utils.ThreadSafeDict() @@ -536,16 +535,9 @@ def _sync_down_logs(): info.status_property.sky_down_status = ProcessStatus.RUNNING serve_state.add_or_update_replica(self.service_name, replica_id, info) - def scale_down(self, n: int) -> None: - # Terminate n replicas - # TODO(tian): Policy to choose replica to scale down. - infos = serve_state.get_replica_infos(self.service_name) - if len(infos) < n: - logger.error(f'Cannot scale down {n} replicas since there are ' - f'only {len(infos)} replicas. 
Scale down all ' - 'replicas instead.') - for i in range(min(n, len(infos))): - self._teardown_replica(infos[i].replica_id, sync_down_logs=False) + def scale_down(self, replica_ids: List[int]) -> None: + for replica_id in replica_ids: + self._teardown_replica(replica_id, sync_down_logs=False) # TODO(tian): Maybe just kill all threads and cleanup using db record def terminate(self) -> Optional[str]: diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 5bb99e1d74d..3fb80647a95 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -43,6 +43,13 @@ def __init__( serve_utils.RequestTimestamp()) def _sync_with_controller(self): + """Sync with controller periodically. + + Every `constants.CONTROLLER_SYNC_INTERVAL` seconds, the load balancer + will sync with the controller to get the latest information about + available replicas; also, it report the request information to the + controller, so that the controller can make autoscaling decisions. + """ while True: with requests.Session() as session: try: From 673c74717bf0c11d252053f01705df0320b18aed Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 13 Oct 2023 16:48:24 -0700 Subject: [PATCH 124/223] move several var from local db --- sky/backends/backend_utils.py | 11 ++++++++--- sky/cli.py | 3 +-- sky/execution.py | 3 --- sky/serve/autoscalers.py | 1 + sky/serve/controller.py | 4 ++++ sky/serve/infra_providers.py | 16 ++++++++++++++++ sky/serve/serve_utils.py | 9 --------- sky/utils/cli_utils/status_utils.py | 23 ++++++----------------- 8 files changed, 36 insertions(+), 34 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 8e8679c505f..2245cfdca2f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2715,9 +2715,14 @@ def _add_default_value_to_local_record( # NOTE(dev): Keep this align with sky.serve.controller.get_latest_info if record is None: return record - record['status'] = 
serve_lib.ServiceStatus.UNKNOWN - record['uptime'] = None - record['replica_info'] = [] + record.update({ + 'replica_info': [], + 'uptime': None, + 'status': serve_lib.ServiceStatus.UNKNOWN, + 'policy': '', + 'auto_restart': False, + 'requested_resources': sky.Resources(), + }) return record diff --git a/sky/cli.py b/sky/cli.py index aa8d01999ac..5fa51196cec 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4272,13 +4272,12 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): f'Replicas{colorama.Style.RESET_ALL}') replica_infos = [] for service_record in service_records: - handle: serve_lib.ServiceHandle = service_record['handle'] for replica_record in service_record['replica_info']: # Only print FAILED replicas if: # 1. --all is specified; # 2. auto_restart is not enabled (in which FAILED replica count # as one replica). - if (all or not handle.auto_restart or + if (all or not service_record['auto_restart'] or replica_record['status'] != serve_lib.ReplicaStatus.FAILED): replica_record['service_name'] = service_record['name'] replica_infos.append(replica_record) diff --git a/sky/execution.py b/sky/execution.py index e4d4b487aea..c26482224a7 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1037,10 +1037,7 @@ def serve_up( service_handle = serve.ServiceHandle( service_name=service_name, - policy=task.service.policy_str(), - requested_resources=requested_resources, requested_controller_resources=controller_resources, - auto_restart=task.service.auto_restart, controller_port=controller_port, load_balancer_port=load_balancer_port) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 2857dfdaa91..88fb2fdb463 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -54,6 +54,7 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', self.auto_restart = spec.auto_restart self.min_replicas: int = spec.min_replicas self.max_replicas: int = spec.max_replicas or spec.min_replicas + self.policy_str = spec.policy_str() 
self.frequency = frequency if self.frequency < constants.CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index c3204512caa..fadba560681 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -146,6 +146,10 @@ def get_latest_info(): 'uptime': record.get('uptime', None), 'status': record.get('status', serve_state.ServiceStatus.UNKNOWN), + 'policy': self.autoscaler.policy_str, + 'auto_restart': self.autoscaler.auto_restart, + 'requested_resources': + self.infra_provider.get_requested_resources(), } latest_info = { k: base64.b64encode(pickle.dumps(v)).decode('utf-8') diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index e54a2880f77..a00aed7fec3 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -14,9 +14,11 @@ import psutil import requests +import yaml from sky import backends from sky import global_user_state +from sky import resources from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state @@ -265,6 +267,10 @@ def get_ready_replicas(self) -> Set[str]: # Returns the endpoints of all ready replicas raise NotImplementedError + def get_requested_resources(self) -> resources.Resources: + # Returns the requested resources for the service + raise NotImplementedError + def scale_up(self, n: int) -> None: raise NotImplementedError @@ -449,6 +455,16 @@ def get_ready_replicas(self) -> Set[str]: ready_replicas.add(info.ip) return ready_replicas + def get_requested_resources(self) -> resources.Resources: + with open(self.task_yaml_path, 'r') as f: + config = yaml.safe_load(f) + resources_config = None + if isinstance(config, dict): + resources_config = config.get('resources') + if resources_config is None: + return resources.Resources() + return resources.Resources.from_yaml_config(resources_config) + def _launch_replica(self, replica_id: int) -> None: 
cluster_name = serve_utils.generate_replica_cluster_name( self.service_name, replica_id) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index ab77122e065..526b28ebeaf 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -377,10 +377,7 @@ def __init__( self, *, service_name: str, - policy: str, - requested_resources: 'sky.Resources', requested_controller_resources: 'sky.Resources', - auto_restart: bool, controller_port: int, load_balancer_port: int, endpoint_ip: Optional[str] = None, @@ -388,10 +385,7 @@ def __init__( ) -> None: self._version = self._VERSION self.service_name = service_name - self.policy = policy - self.requested_resources = requested_resources self.requested_controller_resources = requested_controller_resources - self.auto_restart = auto_restart self.controller_port = controller_port self.load_balancer_port = load_balancer_port self.endpoint_ip = endpoint_ip @@ -400,11 +394,8 @@ def __init__( def __repr__(self) -> str: return ('ServiceHandle(' f'\n\tservice_name={self.service_name},' - f'\n\tpolicy={self.policy},' - f'\n\trequested_resources={self.requested_resources},' '\n\trequested_controller_resources=' f'{self.requested_controller_resources},' - f'\n\tauto_restart={self.auto_restart},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' f'\n\tendpoint_ip={self.endpoint_ip},' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index ef7d21b006b..cd31ca49f3e 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -390,27 +390,17 @@ def show_local_status_table(local_clusters: List[str]): _get_duration = (lambda cluster_record: log_utils.readable_time_duration( 0, cluster_record['duration'], absolute=True)) _get_replica_id = lambda replica_record: replica_record['replica_id'] -_get_service_name = (lambda replica_record: replica_record['service_name']) +_get_service_name = lambda 
replica_record: replica_record['service_name'] +_get_controller_name = lambda replica_record: replica_record['controller_name'] +_get_policy = lambda replica_record: replica_record['policy'] +_get_requested_resources = lambda replica_record: replica_record[ + 'requested_resources'] def _get_service_handle(service_record: _ServiceRecord) -> serve.ServiceHandle: return service_record['handle'] -def _get_controller_name(service_record: _ServiceRecord) -> str: - return service_record['controller_name'] - - -def _get_policy(service_record: _ServiceRecord) -> str: - handle = _get_service_handle(service_record) - return handle.policy - - -def _get_requested_resources(service_record: _ServiceRecord) -> 'sky.Resources': - handle = _get_service_handle(service_record) - return handle.requested_resources - - def _get_uptime(service_record: _ServiceRecord) -> str: uptime = service_record['uptime'] if uptime is None: @@ -420,12 +410,11 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: ready_replica_num, total_replica_num = 0, 0 - auto_restart = _get_service_handle(service_record).auto_restart for info in service_record['replica_info']: if _get_status(info) == serve.ReplicaStatus.READY: ready_replica_num += 1 # If auto restart enabled, not count FAILED replicas here. 
- if (not auto_restart or + if (not service_record['auto_restart'] or _get_status(info) != serve.ReplicaStatus.FAILED): total_replica_num += 1 return f'{ready_replica_num}/{total_replica_num}' From 003c6552c9f7719fc4887033109a6ac0c740ab12 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 15 Oct 2023 16:35:00 -0700 Subject: [PATCH 125/223] use a control process to handle signal and terminate service --- sky/core.py | 70 +------- sky/data/storage.py | 34 ++-- sky/exceptions.py | 5 + sky/execution.py | 8 - sky/serve/__init__.py | 1 - sky/serve/constants.py | 6 + sky/serve/controller.py | 125 +++----------- sky/serve/infra_providers.py | 151 ++++------------- sky/serve/load_balancer.py | 65 ++------ sky/serve/serve_utils.py | 160 +++++++++--------- sky/serve/service.py | 181 +++++++++++++++++++++ sky/sky_logging.py | 12 ++ sky/templates/sky-serve-controller.yaml.j2 | 23 +-- sky/utils/subprocess_utils.py | 47 +++--- 14 files changed, 394 insertions(+), 494 deletions(-) create mode 100644 sky/serve/service.py diff --git a/sky/core.py b/sky/core.py index 89e21148b75..39f426a8029 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1145,7 +1145,6 @@ def serve_down(service_name: str, purge: bool = False) -> None: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r} not found.') - service_handle: serve.ServiceHandle = service_record['handle'] controller_name = service_record['controller_name'] handle = global_user_state.get_handle_from_cluster_name(controller_name) @@ -1165,12 +1164,8 @@ def serve_down(service_name: str, purge: bool = False) -> None: code = serve.ServeCodeGen.terminate_service(service_name) try: - (returncode, terminate_service_payload, - stderr) = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) + returncode, stdout, _ = backend.run_on_head( + handle, code, require_outputs=True, stream_logs=False) except exceptions.FetchIPError as e: raise 
RuntimeError(controller_fetch_ip_error_message) from e @@ -1178,25 +1173,9 @@ def serve_down(service_name: str, purge: bool = False) -> None: returncode, code, ('Failed when submit termination request to controller ' f'of service {service_name!r}'), - stderr, + stdout, stream_logs=False) - resp = serve.load_terminate_service_result( - terminate_service_payload) - if resp.status_code != 200: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError('Failed to terminate replica of service ' - f'{service_name!r} due to request ' - f'failure: {resp.text}') - msg = resp.json().get('message') - if msg: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Unexpected message when tearing down replica of ' - f'service {service_name!r}: {msg}. Please login to ' - 'the controller by `ssh ` and ' - 'make sure the service is properly cleaned up.') - # We want to make sure no matter what error happens, we can still # clean up the record if purge is True. # pylint: disable=broad-except @@ -1209,52 +1188,9 @@ def serve_down(service_name: str, purge: bool = False) -> None: with ux_utils.print_exception_no_traceback(): raise RuntimeError(e) from e - try: - if handle is not None: - assert isinstance(handle, backends.CloudVmRayResourceHandle) - backend = backends.CloudVmRayBackend() - backend.register_info(minimize_logging=True) - - # Cleanup all files on controller related to this service. - # We have a 10-min grace period for the controller to autostop, - # so it should be fine if this is the last service on the - # controller and its job is the only one running. 
- # Also, Cleanup the service record in controller VM - code = serve.ServeCodeGen.cleanup_service(service_name) - returncode, _, stderr = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - subprocess_utils.handle_returncode( - returncode, - code, (f'Failed cleaning up service {service_name!r}'), - stderr, - stream_logs=False) - - # same as above. - # pylint: disable=broad-except - except Exception as e: - if purge: - logger.warning( - f'Ignoring error when clean up service {service_name!r}: ' - f'{common_utils.format_exception(e)}') - else: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(e) from e - # TODO(tian): Maybe add a post_cleanup function? controller_yaml_path = serve.generate_controller_yaml_file_name( service_name) if os.path.exists(controller_yaml_path): os.remove(controller_yaml_path) - try: - service_handle.cleanup_ephemeral_storage() - # same as above. - except Exception as e: # pylint: disable=broad-except - if purge: - logger.warning('Ignoring error when cleaning up ephemeral storage ' - f'of service {service_name}: {e}') - else: - raise RuntimeError(e) from e global_user_state.remove_service(service_name) diff --git a/sky/data/storage.py b/sky/data/storage.py index 8a40f1b986d..9dd755a1bba 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -277,7 +277,7 @@ def upload(self) -> None: """ raise NotImplementedError - def delete(self, silent: bool = False) -> None: + def delete(self) -> None: """Removes the Storage object from the cloud.""" raise NotImplementedError @@ -495,7 +495,7 @@ def __init__(self, (isinstance(self.source, list) or not data_utils.is_cloud_store_url(self.source))): msg = ' and uploading from source' - logger.debug(f'Verifying bucket{msg} for storage {self.name}') + logger.info(f'Verifying bucket{msg} for storage {self.name}') self.sync_all_stores() else: @@ -726,7 +726,7 @@ def add_store(self, store_type: Union[str, StoreType]) -> AbstractStore: 
store_type = StoreType(store_type) if store_type in self.stores: - logger.debug(f'Storage type {store_type} already exists.') + logger.info(f'Storage type {store_type} already exists.') return self.stores[store_type] store_cls: Type[AbstractStore] @@ -786,9 +786,7 @@ def _add_store(self, store: AbstractStore, is_reconstructed: bool = False): global_user_state.add_or_update_storage(self.name, self.handle, StorageStatus.INIT) - def delete(self, - store_type: Optional[StoreType] = None, - silent: bool = False) -> None: + def delete(self, store_type: Optional[StoreType] = None) -> None: """Deletes data for all sky-managed storage objects. If a storage is not managed by sky, it is not deleted from the cloud. @@ -808,7 +806,7 @@ def delete(self, # remove handle and return if is_sky_managed: self.handle.remove_store(store) - store.delete(silent=silent) + store.delete() # Check remaining stores - if none is sky managed, remove # the storage from global_user_state. delete = all( @@ -818,16 +816,16 @@ def delete(self, else: global_user_state.set_storage_handle(self.name, self.handle) elif self.force_delete: - store.delete(silent=silent) + store.delete() # Remove store from bookkeeping del self.stores[store_type] else: for _, store in self.stores.items(): if store.is_sky_managed: self.handle.remove_store(store) - store.delete(silent=silent) + store.delete() elif self.force_delete: - store.delete(silent=silent) + store.delete() self.stores = {} # Remove storage from global_user_state if present global_user_state.remove_storage(self.name) @@ -1092,10 +1090,8 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self, silent: bool = False) -> None: + def delete(self) -> None: deleted_by_skypilot = self._delete_s3_bucket(self.name) - if silent: - return if deleted_by_skypilot: msg_str = f'Deleted S3 bucket {self.name}.' 
else: @@ -1492,10 +1488,8 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self, silent: bool = False) -> None: + def delete(self) -> None: deleted_by_skypilot = self._delete_gcs_bucket(self.name) - if silent: - return if deleted_by_skypilot: msg_str = f'Deleted GCS bucket {self.name}.' else: @@ -1866,10 +1860,8 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self, silent: bool = False) -> None: + def delete(self) -> None: deleted_by_skypilot = self._delete_r2_bucket(self.name) - if silent: - return if deleted_by_skypilot: msg_str = f'Deleted R2 bucket {self.name}.' else: @@ -2273,10 +2265,8 @@ def upload(self): raise exceptions.StorageUploadError( f'Upload failed for store {self.name}') from e - def delete(self, silent: bool = False) -> None: + def delete(self) -> None: self._delete_cos_bucket() - if silent: - return logger.info(f'{colorama.Fore.GREEN}Deleted COS bucket {self.name}.' 
f'{colorama.Style.RESET_ALL}') diff --git a/sky/exceptions.py b/sky/exceptions.py index b6c676ec700..2eda6144c7c 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -250,3 +250,8 @@ def __init__(self, region: str, self.reason = reason super().__init__(reason.message) + + +class ServeUserTerminatedError(Exception): + """Raised when a user tear down the service.""" + pass diff --git a/sky/execution.py b/sky/execution.py index c26482224a7..40c54474ee9 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1056,13 +1056,6 @@ def serve_up( '`sky serve up` process hanging abnormally.') from e _maybe_translate_local_file_mounts_and_sync_up(task, prefix='serve') - ephemeral_storage = [] - if task.storage_mounts is not None: - for storage in task.storage_mounts.values(): - if not storage.persistent: - ephemeral_storage.append(storage.to_yaml_config()) - service_handle.ephemeral_storage = ephemeral_storage - global_user_state.set_service_handle(service_name, service_handle) with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: @@ -1086,7 +1079,6 @@ def serve_up( 'service_name': service_name, 'controller_port': controller_port, 'load_balancer_port': load_balancer_port, - 'replica_port': task.service.replica_port, 'controller_log_file': controller_log_file, 'load_balancer_log_file': load_balancer_log_file, 'envs': _shared_controller_env_vars(), diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 028479eb415..fe81076f411 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -22,7 +22,6 @@ from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import get_available_controller_name from sky.serve.serve_utils import load_latest_info -from sky.serve.serve_utils import load_terminate_service_result from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import ServiceHandle diff --git a/sky/serve/constants.py 
b/sky/serve/constants.py index a1dd1f34d95..7db4cb248ed 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -19,6 +19,12 @@ CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' CONTROLLER_FILE_LOCK_TIMEOUT = 20 +# Signal file path for controller to handle signals. +SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' + +# Timeout for `sky serve down`. +SERVICE_TERMINATION_TIMEOUT = 180 + # The time interval for load balancer to sync with controller. Every time the # load balancer syncs with controller, it will update all available replica ips # for each service, also send the number of requests in last query interval. diff --git a/sky/serve/controller.py b/sky/serve/controller.py index fadba560681..c05cd121d85 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -2,7 +2,6 @@ Responsible for autoscaling and replica management. """ -import argparse import asyncio import base64 import logging @@ -13,7 +12,6 @@ import fastapi import uvicorn -from sky import authentication from sky import serve from sky import sky_logging from sky.serve import autoscalers @@ -44,27 +42,21 @@ class SkyServeController: - Providing the HTTP Server API for SkyServe to communicate with. 
""" - def __init__(self, port: int, infra_provider: infra_providers.InfraProvider, - autoscaler: autoscalers.Autoscaler) -> None: + def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, + task_yaml: str, port: int) -> None: + self.infra_provider: infra_providers.InfraProvider = ( + infra_providers.SkyPilotInfraProvider(service_name, + service_spec, + task_yaml_path=task_yaml)) + self.autoscaler: autoscalers.Autoscaler = ( + autoscalers.RequestRateAutoscaler( + service_spec, + frequency=constants.AUTOSCALER_SCALE_FREQUENCY, + cooldown=constants.AUTOSCALER_COOLDOWN_SECONDS, + rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE)) self.port = port - self.infra_provider = infra_provider - self.autoscaler = autoscaler - self.terminating = False - self.load_balancer_received_terminal_signal = False self.app = fastapi.FastAPI() - def _check_terminate(self): - while True: - if self.terminating and self.load_balancer_received_terminal_signal: - # 1s grace period for the rare case that terminate is set but - # return of /terminate request is not ready yet. - time.sleep(1) - logger.info('Terminate controller...') - # TODO(tian): Directly kill all threads and cleanup using db - # record, instead of waiting the threads to receive signal. - serve_utils.kill_children_and_self_processes() - time.sleep(10) - def _run_autoscaler(self): logger.info('Starting autoscaler monitor.') while True: @@ -87,25 +79,12 @@ def _run_autoscaler(self): # No matter what error happens, we should keep the # monitor running. 
logger.error(f'Error in autoscaler: {e}') - for _ in range(self.autoscaler.frequency): - if self.autoscaler_stop_event.is_set(): - logger.info('Autoscaler monitor terminated.') - return - time.sleep(1) - - def _start_autoscaler(self): - self.autoscaler_stop_event = threading.Event() - self.autoscaler_thread = threading.Thread(target=self._run_autoscaler) - self.autoscaler_thread.start() - - def _terminate_autoscaler(self): - self.autoscaler_stop_event.set() - self.autoscaler_thread.join() + time.sleep(self.autoscaler.frequency) def run(self) -> None: - @self.app.post('/controller/report_request_information') - def report_request_information(request: fastapi.Request): + @self.app.post('/controller/load_balancer_sync') + def load_balancer_sync(request: fastapi.Request): request_data = asyncio.run(request.json()) request_information_payload = request_data.get( 'request_information') @@ -120,18 +99,8 @@ def report_request_information(request: fastapi.Request): 'serve_utils.RequestTimestamp for ' 'RequestRateAutoscaler.') self.autoscaler.update_request_information(request_information) - return {'message': 'Success'} - - @self.app.get('/controller/get_ready_replicas') - def get_ready_replicas(): return {'ready_replicas': self.infra_provider.get_ready_replicas()} - @self.app.get('/controller/is_terminating') - def is_terminating(): - if self.terminating: - self.load_balancer_received_terminal_signal = True - return {'is_terminating': self.terminating} - @self.app.get('/controller/get_latest_info') def get_latest_info(): # NOTE(dev): Keep this align with @@ -157,30 +126,7 @@ def get_latest_info(): } return latest_info - @self.app.post('/controller/terminate') - def terminate(request: fastapi.Request): - del request - logger.info('Terminating service...') - serve_state.set_service_status( - self.infra_provider.service_name, - serve_state.ServiceStatus.SHUTTING_DOWN) - logger.info('Terminate autoscaler...') - self._terminate_autoscaler() - msg = 
self.infra_provider.terminate() - if msg is None: - # We cannot terminate the controller now because we still - # need the output of this request to be sent back. - self.terminating = True - return {'message': msg} - - self._start_autoscaler() - - # Start a daemon to check if the controller is terminating, and if so, - # shutdown the controller so the skypilot jobs will finish, thus enable - # the controller VM to autostop. - terminate_checking_daemon = threading.Thread( - target=self._check_terminate, daemon=True) - terminate_checking_daemon.start() + threading.Thread(target=self._run_autoscaler).start() # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflowing # the controller logs. @@ -193,39 +139,8 @@ def terminate(request: fastapi.Request): uvicorn.run(self.app, host='localhost', port=self.port) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='SkyServe Controller') - parser.add_argument('--service-name', - type=str, - help='Name of the service', - required=True) - parser.add_argument('--task-yaml', - type=str, - help='Task YAML file', - required=True) - parser.add_argument('--controller-port', - type=int, - help='Port to run the controller', - required=True) - args = parser.parse_args() - - # Generate ssh key pair to avoid race condition when multiple sky.launch - # are executed at the same time. 
- authentication.get_or_generate_keys() - - # ======= Infra Provider ========= - service_spec = serve.SkyServiceSpec.from_yaml(args.task_yaml) - _infra_provider = infra_providers.SkyPilotInfraProvider( - args.service_name, service_spec, task_yaml_path=args.task_yaml) - - # ======= Autoscaler ========= - _autoscaler = autoscalers.RequestRateAutoscaler( - service_spec, - frequency=constants.AUTOSCALER_SCALE_FREQUENCY, - cooldown=constants.AUTOSCALER_COOLDOWN_SECONDS, - rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE) - - # ======= SkyServeController ========= - controller = SkyServeController(args.controller_port, _infra_provider, - _autoscaler) +def run_controller(service_name: str, service_spec: serve.SkyServiceSpec, + task_yaml: str, controller_port: int): + controller = SkyServeController(service_name, service_spec, task_yaml, + controller_port) controller.run() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index a00aed7fec3..ac750d26fb2 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -9,6 +9,7 @@ import subprocess import threading import time +import traceback import typing from typing import Any, Dict, List, Optional, Set, Tuple, Union @@ -16,6 +17,7 @@ import requests import yaml +import sky from sky import backends from sky import global_user_state from sky import resources @@ -25,6 +27,8 @@ from sky.serve import serve_utils from sky.skylet import constants from sky.skylet import job_lib +from sky.usage import usage_lib +from sky.utils import common_utils if typing.TYPE_CHECKING: from sky.serve import service_spec @@ -37,6 +41,28 @@ _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 +def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: + """Terminate the sky serve replica cluster.""" + retry_cnt = 0 + while True: + try: + usage_lib.messages.usage.set_internal() + sky.down(cluster_name) + return + except ValueError: + # The cluster is already down. 
+ return + except Exception as e: # pylint: disable=broad-except + retry_cnt += 1 + if retry_cnt >= max_retry: + raise RuntimeError('Failed to terminate the sky serve replica ' + f'cluster {cluster_name}.') from e + logger.error('Failed to terminate the sky serve replica ' + f'cluster {cluster_name}. Retrying.' + f'Details: {common_utils.format_exception(e)}') + logger.error(f' Traceback: {traceback.format_exc()}') + + def _interrupt_process_and_children(pid: int) -> None: parent_process = psutil.Process(pid) for child_process in parent_process.children(recursive=True): @@ -277,10 +303,6 @@ def scale_up(self, n: int) -> None: def scale_down(self, replica_ids: List[int]) -> None: raise NotImplementedError - def terminate(self) -> Optional[str]: - # Terminate service - raise NotImplementedError - class SkyPilotInfraProvider(InfraProvider): """Infra provider for SkyPilot clusters.""" @@ -294,9 +316,9 @@ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', self.down_process_pool: serve_utils.ThreadSafeDict[ int, subprocess.Popen] = serve_utils.ThreadSafeDict() - self._start_process_pool_refresher() - self._start_job_status_fetcher() - self._start_replica_prober() + threading.Thread(target=self._process_pool_refresher).start() + threading.Thread(target=self._job_status_fetcher).start() + threading.Thread(target=self._replica_prober).start() # This process periodically checks all sky.launch and sky.down process # on the fly. If any of them finished, it will update the status of @@ -362,7 +384,6 @@ def _refresh_process_pool(self) -> None: serve_state.add_or_update_replica(self.service_name, replica_id, info) - # TODO(tian): Maybe use decorator? def _process_pool_refresher(self) -> None: while True: logger.info('Refreshing process pool.') @@ -372,17 +393,7 @@ def _process_pool_refresher(self) -> None: # No matter what error happens, we should keep the # process pool refresher running. 
logger.error(f'Error in process pool refresher: {e}') - for _ in range(_PROCESS_POOL_REFRESH_INTERVAL): - if self.process_pool_refresher_stop_event.is_set(): - logger.info('Process pool refresher terminated.') - return - time.sleep(1) - - def _start_process_pool_refresher(self) -> None: - self.process_pool_refresher_stop_event = threading.Event() - self.process_pool_refresher_thread = threading.Thread( - target=self._process_pool_refresher) - self.process_pool_refresher_thread.start() + time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) @with_lock def _fetch_job_status(self) -> None: @@ -420,25 +431,7 @@ def _job_status_fetcher(self) -> None: # No matter what error happens, we should keep the # job status fetcher running. logger.error(f'Error in job status fetcher: {e}') - for _ in range(_JOB_STATUS_FETCH_INTERVAL): - if self.job_status_fetcher_stop_event.is_set(): - logger.info('Job status fetcher terminated.') - return - time.sleep(1) - - def _start_job_status_fetcher(self) -> None: - self.job_status_fetcher_stop_event = threading.Event() - self.job_status_fetcher_thread = threading.Thread( - target=self._job_status_fetcher) - self.job_status_fetcher_thread.start() - - def _terminate_daemon_threads(self) -> None: - self.replica_prober_stop_event.set() - self.job_status_fetcher_stop_event.set() - self.process_pool_refresher_stop_event.set() - self.replica_prober_thread.join() - self.job_status_fetcher_thread.join() - self.process_pool_refresher_thread.join() + time.sleep(_JOB_STATUS_FETCH_INTERVAL) def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: return [ @@ -555,76 +548,6 @@ def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: self._teardown_replica(replica_id, sync_down_logs=False) - # TODO(tian): Maybe just kill all threads and cleanup using db record - def terminate(self) -> Optional[str]: - logger.info('Terminating infra provider daemon threads...') - self._terminate_daemon_threads() - logger.info('Terminating all 
clusters...') - for replica_id, p in self.launch_process_pool.items(): - # Use keyboard interrupt here since sky.launch has great - # handling for it - # Edge case: sky.launched finished after the - # process_pool_refresher terminates - if p.poll() is None: - assert p.pid is not None - # Interrupt the launch process and its children. We use SIGINT - # here since sky.launch has great handling for it. - _interrupt_process_and_children(p.pid) - p.wait() - logger.info( - f'Interrupted launch process for replica {replica_id} ' - 'and deleted the cluster.') - self._teardown_replica(replica_id, sync_down_logs=False) - info = serve_state.get_replica_info_from_id( - self.service_name, replica_id) - assert info is not None - # Set to success here for correctly display as shutting down - info.status_property.sky_launch_status = ProcessStatus.SUCCEEDED - serve_state.add_or_update_replica(self.service_name, replica_id, - info) - msg = [] - infos = serve_state.get_replica_infos(self.service_name) - # TODO(tian): Move all cleanup to the control process - for info in infos: - if info.status in [ - serve_state.ReplicaStatus.FAILED_CLEANUP, - serve_state.ReplicaStatus.UNKNOWN, - ]: - msg.append(f'Replica with status {info.status} found. Please ' - 'manually check the cloud console to make sure no ' - 'resource leak.') - # Skip those already deleted and those are deleting - if info.status not in [ - serve_state.ReplicaStatus.FAILED, - serve_state.ReplicaStatus.SHUTTING_DOWN - ]: - self._teardown_replica(info.replica_id, sync_down_logs=False) - for replica_id, p in self.down_process_pool.items(): - p.wait() - logger.info(f'Down process for replica {replica_id} finished.') - if p.returncode != 0: - logger.warning(f'Down process for replica {replica_id} exited ' - f'abnormally with code {p.returncode}.') - msg.append( - f'Down process for replica {replica_id} exited abnormally ' - f'with code {p.returncode}. 
Please login to the ' - 'controller and make sure the replica is released.') - else: - serve_state.remove_replica(self.service_name, replica_id) - infos = serve_state.get_replica_infos(self.service_name) - for info in infos: - if not info.status in serve_state.ReplicaStatus.failed_statuses(): - # This should not happen since we already teardown all - # replicas. Here we just add a double check. - msg.append(f'Replica {info.replica_id} is not deleted. ' - 'Please login to the controller and make sure ' - 'the replica is released.') - else: - serve_state.remove_replica(self.service_name, info.replica_id) - if not msg: - return None - return '\n'.join(msg) - def _replica_prober(self) -> None: while True: logger.info('Running replica prober.') @@ -640,17 +563,7 @@ def _replica_prober(self) -> None: # No matter what error happens, we should keep the # replica prober running. logger.error(f'Error in replica prober: {e}') - for _ in range(serve_constants.ENDPOINT_PROBE_INTERVAL): - if self.replica_prober_stop_event.is_set(): - logger.info('Replica prober terminated.') - return - time.sleep(1) - - def _start_replica_prober(self) -> None: - self.replica_prober_stop_event = threading.Event() - self.replica_prober_thread = threading.Thread( - target=self._replica_prober) - self.replica_prober_thread.start() + time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL) @with_lock def _probe_all_replicas(self) -> None: diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 3fb80647a95..bb64f71aa92 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,5 +1,4 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" -import argparse import base64 import pickle import threading @@ -11,7 +10,7 @@ from sky import sky_logging from sky.serve import constants -from sky.serve import load_balancing_policies +from sky.serve import load_balancing_policies as lb_policies from sky.serve import serve_utils # Use the explicit logger name 
so that the logger is under the @@ -28,17 +27,16 @@ class SkyServeLoadBalancer: policy. """ - def __init__( - self, controller_url: str, load_balancer_port: int, replica_port: int, - load_balancing_policy: load_balancing_policies.LoadBalancingPolicy - ) -> None: + def __init__(self, controller_url: str, load_balancer_port: int, + replica_port: int) -> None: self.app = fastapi.FastAPI() self.controller_url = controller_url # This is the port where the load balancer listens to. self.load_balancer_port = load_balancer_port # This is the port where the replica app listens to. self.replica_port = replica_port - self.load_balancing_policy = load_balancing_policy + self.load_balancing_policy: lb_policies.LoadBalancingPolicy = ( + lb_policies.RoundRobinPolicy()) self.request_information: serve_utils.RequestInformation = ( serve_utils.RequestTimestamp()) @@ -53,23 +51,9 @@ def _sync_with_controller(self): while True: with requests.Session() as session: try: - # TODO(tian): Maybe merge all of them into one request? - # check if the controller is terminating. If so, shut down - # the load balancer so the skypilot jobs will finish, thus - # enable the controller VM to autostop. - response = session.get(self.controller_url + - '/controller/is_terminating') - response.raise_for_status() - logger.debug( - f'Controller terminating status: {response.json()}') - if response.json().get('is_terminating'): - logger.info('Controller is terminating. ' - 'Shutting down load balancer.') - serve_utils.kill_children_and_self_processes() - # send request information + # Send request information response = session.post( - self.controller_url + - '/controller/report_request_information', + self.controller_url + '/controller/load_balancer_sync', json={ 'request_information': base64.b64encode( pickle.dumps(self.request_information) @@ -79,10 +63,6 @@ def _sync_with_controller(self): # Clean up after reporting request information to avoid OOM. 
self.request_information.clear() response.raise_for_status() - # get replica ips - response = session.get(self.controller_url + - '/controller/get_ready_replicas') - response.raise_for_status() ready_replicas = response.json()['ready_replicas'] except requests.RequestException as e: print(f'An error occurred: {e}') @@ -121,30 +101,9 @@ def run(self): uvicorn.run(self.app, host='0.0.0.0', port=self.load_balancer_port) -if __name__ == '__main__': - # Add argparse - parser = argparse.ArgumentParser(description='SkyServe Load Balancer') - parser.add_argument('--load-balancer-port', - type=int, - help='Port to run the load balancer on.', - required=True) - parser.add_argument('--replica-port', - type=int, - help='Port that runs app on replica.', - required=True) - parser.add_argument('--controller-addr', - type=str, - help='Controller address (ip:port).', - required=True) - args = parser.parse_args() - - # ======= Load Balancing Policy ========= - _load_balancing_policy = load_balancing_policies.RoundRobinPolicy() - - # ======= SkyServeLoadBalancer ========= - load_balancer = SkyServeLoadBalancer( - controller_url=args.controller_addr, - load_balancer_port=args.load_balancer_port, - replica_port=args.replica_port, - load_balancing_policy=_load_balancing_policy) +def run_load_balancer(controller_addr: str, load_balancer_port: int, + replica_port: int): + load_balancer = SkyServeLoadBalancer(controller_url=controller_addr, + load_balancer_port=load_balancer_port, + replica_port=replica_port) load_balancer.run() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 526b28ebeaf..82facca7a61 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -2,29 +2,28 @@ import base64 import enum import os +import pathlib import pickle import re import shlex -import shutil -import signal import threading import time import typing from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, Set, - TextIO, Tuple, TypeVar) + TextIO, Tuple, 
Type, TypeVar) import colorama +import filelock import requests from sky import backends +from sky import exceptions from sky import global_user_state from sky import status_lib -from sky.data import storage as storage_lib from sky.serve import constants from sky.serve import serve_state from sky.skylet import job_lib from sky.utils import common_utils -from sky.utils import subprocess_utils if typing.TYPE_CHECKING: import fastapi @@ -46,6 +45,26 @@ class ServiceComponent(enum.Enum): REPLICA = 'replica' +class UserSignal(enum.Enum): + """User signal to send to controller. + + User can send signal to controller by writing to a file. The controller + will read the file and handle the signal. + """ + # Stop the controller, load balancer and all replicas. + TERMINATE = 'terminate' + + # TODO(tian): Add more signals, such as update or pause. + + def error_type(self) -> Type[Exception]: + """Get the error corresponding to the signal.""" + return _SIGNAL_TO_ERROR[self] + + +_SIGNAL_TO_ERROR = { + UserSignal.TERMINATE: exceptions.ServeUserTerminatedError, +} + KeyType = TypeVar('KeyType') ValueType = TypeVar('ValueType') @@ -132,9 +151,30 @@ def __repr__(self) -> str: return f'RequestTimestamp(timestamps={self.timestamps})' -def kill_children_and_self_processes() -> None: - subprocess_utils.kill_children_processes() - os.kill(os.getpid(), signal.SIGKILL) +class RedirectOutputTo: + """Redirect stdout and stderr to a file.""" + + def __init__(self, func: Callable, file: str) -> None: + self.func = func + self.file = file + + def run(self, *args, **kwargs): + # pylint: disable=import-outside-toplevel + import sys + + from sky import sky_logging + with open(self.file, 'w') as f: + sys.stdout = f + sys.stderr = f + # reconfigure logger since the logger is initialized before + # with previous stdout/stderr + sky_logging.reload_logger() + # The subprocess_util.run('sky status') inside + # sky.execution::_execute cannot be redirect, since we cannot + # directly operate on the 
stdout/stderr of the subprocess. This + # is because some code in skypilot will specify the stdout/stderr + # of the subprocess. + self.func(*args, **kwargs) def _get_existing_controller_names() -> Set[str]: @@ -359,15 +399,10 @@ class ServiceHandle(object): """A pickle-able tuple of: - (required) Service name. - - (required) Service autoscaling policy description str. - - (required) Service requested resources. - (required) Service requested controller resources. - - (required) Whether the service have auto restart enabled. - (required) Controller port. - (required) LoadBalancer port. - (optional) Service endpoint IP. - - (optional) Controller and LoadBalancer job id. - - (optional) Ephemeral storage generated for the service. This class is only used as a cache for information fetched from controller. """ @@ -381,7 +416,6 @@ def __init__( controller_port: int, load_balancer_port: int, endpoint_ip: Optional[str] = None, - ephemeral_storage: Optional[List[Dict[str, Any]]] = None, ) -> None: self._version = self._VERSION self.service_name = service_name @@ -389,7 +423,6 @@ def __init__( self.controller_port = controller_port self.load_balancer_port = load_balancer_port self.endpoint_ip = endpoint_ip - self.ephemeral_storage = ephemeral_storage def __repr__(self) -> str: return ('ServiceHandle(' @@ -398,30 +431,18 @@ def __repr__(self) -> str: f'{self.requested_controller_resources},' f'\n\tcontroller_port={self.controller_port},' f'\n\tload_balancer_port={self.load_balancer_port},' - f'\n\tendpoint_ip={self.endpoint_ip},' - f'\n\tephemeral_storage={self.ephemeral_storage})') - - def cleanup_ephemeral_storage(self) -> None: - if self.ephemeral_storage is None: - return - for storage_config in self.ephemeral_storage: - storage = storage_lib.Storage.from_yaml_config(storage_config) - storage.delete(silent=True) + f'\n\tendpoint_ip={self.endpoint_ip})') def __setstate__(self, state): self._version = self._VERSION self.__dict__.update(state) -def 
_get_controller_port_from_service_name(service_name: str) -> int: +def _get_latest_info(service_name: str, decode: bool = True) -> Dict[str, Any]: record = serve_state.get_service_from_name(service_name) if record is None: raise ValueError(f'Service {service_name!r} does not exist.') - return record['controller_port'] - - -def _get_latest_info(service_name: str, decode: bool = True) -> Dict[str, Any]: - controller_port = _get_controller_port_from_service_name(service_name) + controller_port = record['controller_port'] resp = requests.get( _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + '/controller/get_latest_info') @@ -446,19 +467,21 @@ def load_latest_info(payload: str) -> Dict[str, Any]: return latest_info -def terminate_service(service_name: str) -> str: - controller_port = _get_controller_port_from_service_name(service_name) - resp = requests.post( - _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + - '/controller/terminate') - resp = base64.b64encode(pickle.dumps(resp)).decode('utf-8') - return common_utils.encode_payload(resp) - - -def load_terminate_service_result(payload: str) -> Any: - terminate_resp = common_utils.decode_payload(payload) - terminate_resp = pickle.loads(base64.b64decode(terminate_resp)) - return terminate_resp +def terminate_service(service_name: str) -> None: + # Send the terminate signal to controller. + signal_file = pathlib.Path(constants.SIGNAL_FILE_PATH.format(service_name)) + # Filelock is needed to prevent race condition between signal + # check/removal and signal writing. 
+ with filelock.FileLock(str(signal_file) + '.lock'): + with signal_file.open(mode='w') as f: + f.write(UserSignal.TERMINATE.value) + f.flush() + print(f'Service {service_name!r} is scheduled to be terminated.') + for _ in range(constants.SERVICE_TERMINATION_TIMEOUT): + record = serve_state.get_service_from_name(service_name) + if record is None: + break + time.sleep(1) def check_service_status_healthy(service_name: str) -> Optional[str]: @@ -468,9 +491,6 @@ def check_service_status_healthy(service_name: str) -> Optional[str]: if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT: return (f'Service {service_name!r} is still initializing its ' 'controller. Please try again later.') - if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED: - return (f'Service {service_name!r}\'s controller failed. ' - 'Cannot tail logs.') return None @@ -544,7 +564,6 @@ def stream_replica_logs(service_name: str, msg = check_service_status_healthy(service_name) if msg is not None: return msg - controller_port = _get_controller_port_from_service_name(service_name) print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process ' f'of replica {replica_id}.{colorama.Style.RESET_ALL}') local_log_file_name = generate_replica_local_log_file_name( @@ -574,24 +593,13 @@ def stream_replica_logs(service_name: str, f'{colorama.Style.RESET_ALL}') def _get_replica_status() -> serve_state.ReplicaStatus: - resp = requests.get( - _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + - '/controller/get_latest_info') - if resp.status_code != 200: - raise ValueError( - f'{colorama.Fore.RED}Failed to get replica info for service ' - f'{service_name}.{colorama.Style.RESET_ALL}') - replica_info = resp.json()['replica_info'] - replica_info = pickle.loads(base64.b64decode(replica_info)) - target_info: Optional[Dict[str, Any]] = None + latest_info = _get_latest_info(service_name) + replica_info = latest_info['replica_info'] for info in replica_info: if 
info['replica_id'] == replica_id: - target_info = info - break - if target_info is None: - raise ValueError( - _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)) - return target_info['status'] + return info['status'] + raise ValueError( + _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)) finish_stream = ( lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING) @@ -601,8 +609,8 @@ def _get_replica_status() -> serve_state.ReplicaStatus: finish_stream=finish_stream, exit_if_stream_end=not follow): print(line, end='', flush=True) - if not follow and _get_replica_status( - ) == serve_state.ReplicaStatus.PROVISIONING: + if (not follow and + _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING): # Early exit if not following the logs. return '' @@ -659,14 +667,6 @@ def _service_is_terminal() -> bool: return '' -def cleanup_service_utility_files(service_name: str) -> None: - """Cleanup utility files for a service.""" - dir_name = generate_remote_service_dir_name(service_name) - dir_name = os.path.expanduser(dir_name) - if os.path.exists(dir_name): - shutil.rmtree(dir_name) - - class ServeCodeGen: """Code generator for SkyServe. 
@@ -697,10 +697,7 @@ def get_latest_info(cls, service_name: str) -> str: @classmethod def terminate_service(cls, service_name: str) -> str: - code = [ - f'msg = serve_utils.terminate_service({service_name!r})', - 'print(msg, end="", flush=True)' - ] + code = [f'serve_utils.terminate_service({service_name!r})'] return cls._build(code) @classmethod @@ -726,15 +723,6 @@ def stream_serve_process_logs(cls, service_name: str, ] return cls._build(code) - # TODO(tian): Move this into termination of controller - @classmethod - def cleanup_service(cls, service_name: str) -> str: - code = [ - f'serve_utils.cleanup_service_utility_files({service_name!r})', - f'serve_state.remove_service({service_name!r})', - ] - return cls._build(code) - @classmethod def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code diff --git a/sky/serve/service.py b/sky/serve/service.py new file mode 100644 index 00000000000..90c23f8d287 --- /dev/null +++ b/sky/serve/service.py @@ -0,0 +1,181 @@ +"""Service: Control both the controller and load balancer.""" +import argparse +import multiprocessing +import os +import pathlib +import shutil +import time +from typing import Dict, List + +import filelock + +from sky import authentication +from sky import exceptions +from sky import serve +from sky import task as task_lib +from sky.backends import cloud_vm_ray_backend +from sky.serve import constants +from sky.serve import controller +from sky.serve import infra_providers +from sky.serve import load_balancer +from sky.serve import serve_state +from sky.serve import serve_utils +from sky.utils import subprocess_utils + + +def _handle_signal(service_name: str) -> None: + """Handles the signal user sent to controller.""" + signal_file = pathlib.Path(constants.SIGNAL_FILE_PATH.format(service_name)) + user_signal = None + if signal_file.exists(): + # Filelock is needed to prevent race condition with concurrent + # signal writing. 
+ with filelock.FileLock(str(signal_file) + '.lock'): + with signal_file.open(mode='r') as f: + user_signal_text = f.read().strip() + try: + user_signal = serve_utils.UserSignal(user_signal_text) + except ValueError: + user_signal = None + # Remove the signal file, after reading it. + signal_file.unlink() + if user_signal is None: + return + assert isinstance(user_signal, serve_utils.UserSignal) + error_type = user_signal.error_type() + raise error_type(f'User signal received: {user_signal.value}') + + +def _cleanup(service_name: str, task_yaml: str) -> bool: + failed = False + replica_infos = serve_state.get_replica_infos(service_name) + info2proc: Dict[infra_providers.ReplicaInfo, + multiprocessing.Process] = dict() + for info in replica_infos: + p = multiprocessing.Process(target=infra_providers.terminate_cluster, + args=(info.cluster_name,)) + p.start() + info2proc[info] = p + info.status_property.sky_launch_status = ( + infra_providers.ProcessStatus.SUCCEEDED) + info.status_property.sky_down_status = ( + infra_providers.ProcessStatus.RUNNING) + serve_state.add_or_update_replica(service_name, info.replica_id, info) + for info, p in info2proc.items(): + p.join() + if p.exitcode == 0: + serve_state.remove_replica(service_name, info.replica_id) + else: + info.status_property.sky_down_status = ( + infra_providers.ProcessStatus.FAILED) + serve_state.add_or_update_replica(service_name, info.replica_id, + info) + failed = True + task = task_lib.Task.from_yaml(task_yaml) + backend = cloud_vm_ray_backend.CloudVmRayBackend() + backend.teardown_ephemeral_storage(task) + return failed + + +def _start(service_name: str, service_dir: str, task_yaml: str, + controller_port: int, load_balancer_port: int, + controller_log_file: str, load_balancer_log_file: str): + """Starts the service.""" + # Create the service working directory if it does not exist. 
+ service_dir = os.path.expanduser(service_dir) + os.makedirs(service_dir, exist_ok=True) + + # Generate ssh key pair to avoid race condition when multiple sky.launch + # are executed at the same time. + authentication.get_or_generate_keys() + + service_spec = serve.SkyServiceSpec.from_yaml(task_yaml) + + controller_process = None + load_balancer_process = None + try: + _handle_signal(service_name) + # Start the controller. + controller_process = multiprocessing.Process( + target=serve_utils.RedirectOutputTo(controller.run_controller, + controller_log_file).run, + args=(service_name, service_spec, task_yaml, controller_port)) + controller_process.start() + + # Sleep for a while to make sure the controller is up. + time.sleep(10) + + # TODO(tian): Support HTTPS. + controller_addr = f'http://localhost:{controller_port}' + replica_port = int(service_spec.replica_port) + # Start the load balancer. + load_balancer_process = multiprocessing.Process( + target=serve_utils.RedirectOutputTo(load_balancer.run_load_balancer, + load_balancer_log_file).run, + args=(controller_addr, load_balancer_port, replica_port)) + load_balancer_process.start() + + while True: + _handle_signal(service_name) + time.sleep(1) + except exceptions.ServeUserTerminatedError: + serve_state.set_service_status(service_name, + serve_state.ServiceStatus.SHUTTING_DOWN) + finally: + process_to_kill: List[multiprocessing.Process] = [] + if load_balancer_process is not None: + process_to_kill.append(load_balancer_process) + if controller_process is not None: + process_to_kill.append(controller_process) + # Kill load balancer process first since it will raise errors if failed + # to connect to the controller. Then the controller process. 
+ subprocess_utils.kill_children_processes( + [process.pid for process in process_to_kill], force=True) + for process in process_to_kill: + process.join() + failed = _cleanup(service_name, task_yaml) + if failed: + serve_state.set_service_status(service_name, + serve_state.ServiceStatus.FAILED) + else: + shutil.rmtree(service_dir) + serve_state.remove_service(service_name) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SkyServe Controller') + parser.add_argument('--service-name', + type=str, + help='Name of the service', + required=True) + parser.add_argument('--service-dir', + type=str, + help='Working directory of the service', + required=True) + parser.add_argument('--task-yaml', + type=str, + help='Task YAML file', + required=True) + parser.add_argument('--controller-port', + type=int, + help='Port to run the controller', + required=True) + parser.add_argument('--load-balancer-port', + type=int, + help='Port to run the load balancer on.', + required=True) + parser.add_argument('--controller-log-file', + type=str, + help='Log file path for the controller', + required=True) + parser.add_argument('--load-balancer-log-file', + type=str, + help='Log file path for the load balancer', + required=True) + args = parser.parse_args() + # We start process with 'spawn', because 'fork' could result in weird + # behaviors; 'spawn' is also cross-platform. + multiprocessing.set_start_method('spawn', force=True) + _start(args.service_name, args.service_dir, args.task_yaml, + args.controller_port, args.load_balancer_port, + args.controller_log_file, args.load_balancer_log_file) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index 04dad68c819..bbe77c9a14a 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -73,6 +73,18 @@ def _setup_logger(): _root_logger.propagate = False +def reload_logger(): + """Reload the logger. + + This is useful when the logging configuration is changed. 
+ e.g., the logging level is changed or stdout/stderr is reset. + """ + global _default_handler + _root_logger.removeHandler(_default_handler) + _default_handler = None + _setup_logger() + + # The logger is initialized when the module is imported. # This is thread-safe as the module is only imported once, # guaranteed by the Python GIL. diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index d451618e28f..d47cfd1856e 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -16,20 +16,15 @@ file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} run: | - # Create working directory for this service. - mkdir -p {{service_dir}} - # Start sky serve controller. - python -u -m sky.serve.controller --service-name {{service_name}} \ - --task-yaml {{remote_task_yaml_path}} --controller-port {{controller_port}} \ - > {{controller_log_file}} 2>&1 & - # Wait for controller to start. - sleep 10 - # Start sky serve load balancer. We keep the load balancer running in the - # foreground so that the job will not finish, thus prevent our controller - # from auto down. - python -u -m sky.serve.load_balancer --load-balancer-port {{load_balancer_port}} \ - --replica-port {{replica_port}} --controller-addr \ - http://localhost:{{controller_port}} > {{load_balancer_log_file}} 2>&1 + # Start sky serve service. 
+ python -u -m sky.serve.service \ + --service-name {{service_name}} \ + --service-dir {{service_dir}} \ + --task-yaml {{remote_task_yaml_path}} \ + --controller-port {{controller_port}} \ + --load-balancer-port {{load_balancer_port}} \ + --controller-log-file {{controller_log_file}} \ + --load-balancer-log-file {{load_balancer_log_file}} envs: {%- for env_name, env_value in envs.items() %} diff --git a/sky/utils/subprocess_utils.py b/sky/utils/subprocess_utils.py index 41b4c0cb555..decba03c5ef 100644 --- a/sky/utils/subprocess_utils.py +++ b/sky/utils/subprocess_utils.py @@ -81,8 +81,9 @@ def handle_returncode(returncode: int, stderr) -def kill_children_processes(first_pid_to_kill: Optional[int] = None, - force: bool = False): +def kill_children_processes( + first_pid_to_kill: Optional[Union[int, List[Optional[int]]]] = None, + force: bool = False): """Kill children processes recursively. We need to kill the children, so that @@ -92,35 +93,43 @@ def kill_children_processes(first_pid_to_kill: Optional[int] = None, etc. while we are cleaning up the clusters. Args: - first_pid_to_kill: Optional PID of a process to be killed first. + first_pid_to_kill: Optional PID of a process, or PIDs of a series of + processes to be killed first. If a list of PID is specified, it is + killed by the order in the list. This is for guaranteeing the order of cleaning up and suppress flaky errors. 
""" - parent_process = psutil.Process() + first_to_kill_pid_to_processes = dict() child_processes = [] - for child in parent_process.children(recursive=True): - if child.pid == first_pid_to_kill: + if isinstance(first_pid_to_kill, int): + first_pid_to_kill = [first_pid_to_kill] + elif first_pid_to_kill is None: + first_pid_to_kill = [] + + def _kill_processes(processes: List[psutil.Process]) -> None: + for process in processes: try: if force: - child.kill() + process.kill() else: - child.terminate() - child.wait() + process.terminate() except psutil.NoSuchProcess: - # The child process may have already been terminated. + # The process may have already been terminated. pass + + parent_process = psutil.Process() + for child in parent_process.children(recursive=True): + if child.pid in first_pid_to_kill: + first_to_kill_pid_to_processes[child.pid] = child else: child_processes.append(child) - for child in child_processes: - try: - if force: - child.kill() - else: - child.terminate() - except psutil.NoSuchProcess: - # The child process may have already been terminated. 
- pass + _kill_processes([ + first_to_kill_pid_to_processes[proc] + for proc in first_pid_to_kill + if proc in first_to_kill_pid_to_processes + ]) + _kill_processes(child_processes) def run_with_retries( From 8d15ef065d44d078df3707f2a06a0fc5195193b8 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 15 Oct 2023 23:02:41 -0700 Subject: [PATCH 126/223] move port selection to the controller VM --- sky/backends/backend_utils.py | 5 +- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/core.py | 4 +- sky/execution.py | 40 +++--- sky/resources.py | 2 +- sky/serve/__init__.py | 5 +- sky/serve/autoscalers.py | 1 - sky/serve/constants.py | 15 ++- sky/serve/controller.py | 32 +---- sky/serve/infra_providers.py | 29 +---- sky/serve/load_balancer.py | 3 + sky/serve/serve_state.py | 64 +++++++-- sky/serve/serve_utils.py | 143 ++++++++++++--------- sky/serve/service.py | 122 +++++++++++------- sky/templates/sky-serve-controller.yaml.j2 | 6 +- sky/utils/cli_utils/status_utils.py | 6 +- 16 files changed, 267 insertions(+), 212 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 2245cfdca2f..dd76ccff1fb 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2719,6 +2719,8 @@ def _add_default_value_to_local_record( 'replica_info': [], 'uptime': None, 'status': serve_lib.ServiceStatus.UNKNOWN, + 'controller_port': None, + 'load_balancer_port': None, 'policy': '', 'auto_restart': False, 'requested_resources': sky.Resources(), @@ -2748,9 +2750,8 @@ def _refresh_service_record_no_lock( return record, 'Failed to refresh replica info due to network error.' service_handle: serve_lib.ServiceHandle = record['handle'] - if not service_handle.endpoint_ip: + if not service_handle.endpoint: # Service controller is still initializing. Skipped refresh status. 
- record['status'] = serve_lib.ServiceStatus.CONTROLLER_INIT return record, None controller_name = record['controller_name'] diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 7efdf8af1e5..12d88815dbf 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3351,7 +3351,7 @@ def _exec_code_on_head( if service_handle is not None: # Add the service to service table on controller VM. serve_code = serve_lib.ServeCodeGen.add_service( - job_id, service_handle) + job_id, service_handle.service_name) job_submit_cmd = job_submit_cmd + ' && ' + serve_code returncode, stdout, stderr = self.run_on_head(handle, diff --git a/sky/core.py b/sky/core.py index 39f426a8029..6ea5843d306 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1181,8 +1181,8 @@ def serve_down(service_name: str, purge: bool = False) -> None: # pylint: disable=broad-except except Exception as e: if purge: - logger.warning('Ignoring error when cleaning ' - f'replicas of {service_name!r}: ' + logger.warning('Ignoring error when terminate ' + f'service {service_name!r}: ' f'{common_utils.format_exception(e)}') else: with ux_utils.print_exception_no_traceback(): diff --git a/sky/execution.py b/sky/execution.py index 40c54474ee9..c9393e74df3 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1032,22 +1032,21 @@ def serve_up( os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH), serve.CONTROLLER_FILE_LOCK_TIMEOUT): controller_name, _ = serve.get_available_controller_name() - controller_port, load_balancer_port = ( - serve.gen_ports_for_serve_process(controller_name)) service_handle = serve.ServiceHandle( service_name=service_name, - requested_controller_resources=controller_resources, - controller_port=controller_port, - load_balancer_port=load_balancer_port) + requested_controller_resources=controller_resources) global_user_state.add_or_update_service( service_name, launched_at=int(time.time()), controller_name=controller_name, 
handle=service_handle) + # TODO(tian): Probably run another sky.launch after we get + # the load balancer port from the controller? So we don't + # need to open so many ports here. controller_resources = controller_resources.copy( - ports=[load_balancer_port]) + ports=[serve.LOAD_BALANCER_PORT_RANGE]) except filelock.Timeout as e: with ux_utils.print_exception_no_traceback(): raise RuntimeError( @@ -1060,27 +1059,18 @@ def serve_up( with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', mode='w') as f: task_config = task.to_yaml_config() - if ('resources' in task_config and - 'spot_recovery' in task_config['resources']): - del task_config['resources']['spot_recovery'] common_utils.dump_yaml(f.name, task_config) remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( service_name) controller_log_file = ( serve.generate_remote_controller_log_file_name(service_name)) - load_balancer_log_file = ( - serve.generate_remote_load_balancer_log_file_name(service_name)) vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': f.name, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, - 'service_dir': serve.generate_remote_service_dir_name(service_name), 'service_name': service_name, - 'controller_port': controller_port, - 'load_balancer_port': load_balancer_port, 'controller_log_file': controller_log_file, - 'load_balancer_log_file': load_balancer_log_file, 'envs': _shared_controller_env_vars(), } controller_yaml_path = serve.generate_controller_yaml_file_name( @@ -1118,7 +1108,23 @@ def serve_up( assert controller_record is not None handle = controller_record['handle'] assert isinstance(handle, backends.CloudVmRayResourceHandle) - service_handle.endpoint_ip = handle.head_ip + with rich_utils.safe_status( + '[cyan]Waiting for service initialization...[/]'): + code = serve.ServeCodeGen.wait_for_load_balancer_port(service_name) + backend = backends.CloudVmRayBackend() + returncode, lb_port_payload, 
stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + subprocess_utils.handle_returncode( + returncode, code, + ('Failed to get load balancer port for service ' + f'{service_name!r}.'), stderr) + load_balancer_port = serve.decode_load_balancer_port( + lb_port_payload) + service_handle.endpoint = f'{handle.head_ip}:{load_balancer_port}' global_user_state.set_service_handle(service_name, service_handle) print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' @@ -1145,7 +1151,7 @@ def serve_up( f'{backend_utils.RESET_BOLD} to get all valid REPLICA_ID)') print(f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' f'{style.RESET_ALL}{fore.CYAN}' - f'{handle.head_ip}:{load_balancer_port}{style.RESET_ALL}') + f'{service_handle.endpoint}{style.RESET_ALL}') print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') print('\nTo monitor replica status:' f'\t{backend_utils.BOLD}watch -n10 sky serve status ' diff --git a/sky/resources.py b/sky/resources.py index 38c677485c9..b4ca502f738 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1164,7 +1164,7 @@ def add_if_not_none(key, value): if self._use_spot_specified: add_if_not_none('use_spot', self.use_spot) - config['spot_recovery'] = self.spot_recovery + add_if_not_none('spot_recovery', self.spot_recovery) config['disk_size'] = self.disk_size add_if_not_none('region', self.region) add_if_not_none('zone', self.zone) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index fe81076f411..ea177ceee76 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -9,15 +9,14 @@ from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import ENDPOINT_PROBE_INTERVAL +from sky.serve.constants import LOAD_BALANCER_PORT_RANGE from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.serve_state import 
ReplicaStatus from sky.serve.serve_state import ServiceStatus -from sky.serve.serve_utils import gen_ports_for_serve_process +from sky.serve.serve_utils import decode_load_balancer_port from sky.serve.serve_utils import generate_controller_yaml_file_name from sky.serve.serve_utils import generate_remote_controller_log_file_name -from sky.serve.serve_utils import generate_remote_load_balancer_log_file_name -from sky.serve.serve_utils import generate_remote_service_dir_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import get_available_controller_name diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 88fb2fdb463..2857dfdaa91 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -54,7 +54,6 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', self.auto_restart = spec.auto_restart self.min_replicas: int = spec.min_replicas self.max_replicas: int = spec.max_replicas or spec.min_replicas - self.policy_str = spec.policy_str() self.frequency = frequency if self.frequency < constants.CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 7db4cb248ed..c7b74f3557e 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -12,19 +12,25 @@ SERVE_PREFIX = '~/.sky/serve' # The filelock for selecting controller and service ports when starting a -# service. In our current multi-service controller implementation, we need to: -# 1. Select a controller if there are some existing controllers; -# 2. Select ports for each service atomically to avoid port conflicts. -# All of them are protected by this file lock from race conditions. +# service. In our current multi-service controller implementation, we need to +# select a controller if there are some existing controllers. 
CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' CONTROLLER_FILE_LOCK_TIMEOUT = 20 +# The filelock for selecting service ports when starting a service. We need to +# have a filelock to avoid port collision when starting multiple services at +# the same time. +PORT_SELECTION_FILE_LOCK_PATH = f'{SERVE_PREFIX}/port_selection.lock' + # Signal file path for controller to handle signals. SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' # Timeout for `sky serve down`. SERVICE_TERMINATION_TIMEOUT = 180 +# Timeout for waiting controller to find a port for service processes. +SERVICE_PORT_SELECTION_TIMEOUT = 20 + # The time interval for load balancer to sync with controller. Every time the # load balancer syncs with controller, it will update all available replica ips # for each service, also send the number of requests in last query interval. @@ -79,3 +85,4 @@ # automatically generated from this start port. CONTROLLER_PORT_START = 20001 LOAD_BALANCER_PORT_START = 30001 +LOAD_BALANCER_PORT_RANGE = '30001-30100' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index c05cd121d85..8d730f78568 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -17,7 +17,6 @@ from sky.serve import autoscalers from sky.serve import constants from sky.serve import infra_providers -from sky.serve import serve_state from sky.serve import serve_utils from sky.utils import env_options @@ -44,6 +43,7 @@ class SkyServeController: def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, port: int) -> None: + self.service_name = service_name self.infra_provider: infra_providers.InfraProvider = ( infra_providers.SkyPilotInfraProvider(service_name, service_spec, @@ -61,8 +61,9 @@ def _run_autoscaler(self): logger.info('Starting autoscaler monitor.') while True: try: - replica_info = self.infra_provider.get_replica_info( - verbose=env_options.Options.SHOW_DEBUG_INFO.get()) + replica_info = serve_utils.get_replica_info( + 
self.service_name, + with_handle=env_options.Options.SHOW_DEBUG_INFO.get()) logger.info(f'All replica info: {replica_info}') scaling_option = self.autoscaler.evaluate_scaling(replica_info) if (scaling_option.operator == @@ -101,31 +102,6 @@ def load_balancer_sync(request: fastapi.Request): self.autoscaler.update_request_information(request_information) return {'ready_replicas': self.infra_provider.get_ready_replicas()} - @self.app.get('/controller/get_latest_info') - def get_latest_info(): - # NOTE(dev): Keep this align with - # sky.backends.backend_utils._add_default_value_to_local_record - record = serve_state.get_service_from_name( - self.infra_provider.service_name) - if record is None: - record = {} - latest_info = { - 'replica_info': - self.infra_provider.get_replica_info(verbose=True), - 'uptime': record.get('uptime', None), - 'status': record.get('status', - serve_state.ServiceStatus.UNKNOWN), - 'policy': self.autoscaler.policy_str, - 'auto_restart': self.autoscaler.auto_restart, - 'requested_resources': - self.infra_provider.get_requested_resources(), - } - latest_info = { - k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in latest_info.items() - } - return latest_info - threading.Thread(target=self._run_autoscaler).start() # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflowing diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index ac750d26fb2..533b4f5cc9b 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -15,12 +15,10 @@ import psutil import requests -import yaml import sky from sky import backends from sky import global_user_state -from sky import resources from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state @@ -285,18 +283,10 @@ def __init__(self, service_name: str, logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') logger.info(f'Post data: {self.post_data} 
({type(self.post_data)})') - def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: - # Get replica info for all replicas - raise NotImplementedError - def get_ready_replicas(self) -> Set[str]: # Returns the endpoints of all ready replicas raise NotImplementedError - def get_requested_resources(self) -> resources.Resources: - # Returns the requested resources for the service - raise NotImplementedError - def scale_up(self, n: int) -> None: raise NotImplementedError @@ -433,12 +423,6 @@ def _job_status_fetcher(self) -> None: logger.error(f'Error in job status fetcher: {e}') time.sleep(_JOB_STATUS_FETCH_INTERVAL) - def get_replica_info(self, verbose: bool) -> List[Dict[str, Any]]: - return [ - info.to_info_dict(with_handle=verbose) - for info in serve_state.get_replica_infos(self.service_name) - ] - def get_ready_replicas(self) -> Set[str]: ready_replicas = set() infos = serve_state.get_replica_infos(self.service_name) @@ -448,16 +432,6 @@ def get_ready_replicas(self) -> Set[str]: ready_replicas.add(info.ip) return ready_replicas - def get_requested_resources(self) -> resources.Resources: - with open(self.task_yaml_path, 'r') as f: - config = yaml.safe_load(f) - resources_config = None - if isinstance(config, dict): - resources_config = config.get('resources') - if resources_config is None: - return resources.Resources() - return resources.Resources.from_yaml_config(resources_config) - def _launch_replica(self, replica_id: int) -> None: cluster_name = serve_utils.generate_replica_cluster_name( self.service_name, replica_id) @@ -555,7 +529,8 @@ def _replica_prober(self) -> None: self._probe_all_replicas() replica_statuses = [ info['status'] - for info in self.get_replica_info(verbose=False) + for info in serve_utils.get_replica_info(self.service_name, + with_handle=False) ] serve_utils.set_service_status_from_replica_statuses( self.service_name, replica_statuses) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 
bb64f71aa92..1bc84a70037 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -48,6 +48,9 @@ def _sync_with_controller(self): available replicas; also, it report the request information to the controller, so that the controller can make autoscaling decisions. """ + # Sleep for a while to wait the controller bootstrap. + time.sleep(5) + while True: with requests.Session() as session: try: diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index ce467e793b4..a3fbf1bca6d 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -13,6 +13,7 @@ from sky.utils import db_utils if typing.TYPE_CHECKING: + import sky from sky.serve import infra_providers _DB_PATH = pathlib.Path(constants.SERVE_PREFIX) / 'services.db' @@ -30,8 +31,12 @@ name TEXT PRIMARY KEY, controller_job_id INTEGER, controller_port INTEGER, + load_balancer_port INTEGER, status TEXT, - uptime INTEGER DEFAULT NULL)""") + uptime INTEGER DEFAULT NULL, + policy TEXT, + auto_restart INTEGER, + requested_resources BLOB)""") _CURSOR.execute("""\ CREATE TABLE IF NOT EXISTS replicas ( service_name TEXT, @@ -121,9 +126,12 @@ class ServiceStatus(enum.Enum): # At least one replica is failed and no replica is ready FAILED = 'FAILED' + # Clean up failed + FAILED_CLEANUP = 'FAILED_CLEANUP' + @classmethod def failed_statuses(cls) -> List['ServiceStatus']: - return [cls.CONTROLLER_FAILED, cls.UNKNOWN, cls.FAILED] + return [cls.CONTROLLER_FAILED, cls.UNKNOWN, cls.FAILED_CLEANUP] def colored_str(self) -> str: color = _SERVICE_STATUS_TO_COLOR[self] @@ -150,19 +158,32 @@ def from_replica_statuses( ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, ServiceStatus.UNKNOWN: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, + ServiceStatus.FAILED_CLEANUP: colorama.Fore.RED, } # === Service functions === -def add_service(job_id: int, service_name: str, controller_port: int) -> None: +def add_or_update_service( + controller_job_id: int, + name: str, + controller_port: 
Optional[int] = None, + load_balancer_port: Optional[int] = None, + status: ServiceStatus = ServiceStatus.CONTROLLER_INIT, + uptime: Optional[int] = None, + policy: Optional[str] = None, + auto_restart: bool = False, + requested_resources: Optional['sky.Resources'] = None) -> None: """Adds a service to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( """\ - INSERT INTO services - (name, controller_job_id, controller_port, status) - VALUES (?, ?, ?, ?)""", (service_name, job_id, controller_port, - ServiceStatus.CONTROLLER_INIT.value)) + INSERT OR REPLACE INTO services + (name, controller_job_id, controller_port, load_balancer_port, + status, uptime, policy, auto_restart, requested_resources) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (name, controller_job_id, controller_port, load_balancer_port, + status.value, uptime, policy, int(auto_restart), + pickle.dumps(requested_resources))) def remove_service(service_name: str) -> None: @@ -190,14 +211,41 @@ def set_service_status(service_name: str, status: ServiceStatus) -> None: status=(?) WHERE name=(?)""", (status.value, service_name)) +def set_service_controller_port(service_name: str, + controller_port: int) -> None: + """Sets the controller port of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + UPDATE services SET + controller_port=(?) WHERE name=(?)""", + (controller_port, service_name)) + + +def set_service_load_balancer_port(service_name: str, + load_balancer_port: int) -> None: + """Sets the load balancer port of a service.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + cursor.execute( + """\ + UPDATE services SET + load_balancer_port=(?) 
WHERE name=(?)""", + (load_balancer_port, service_name)) + + def _get_service_from_row(row) -> Dict[str, Any]: - name, controller_job_id, controller_port, status, uptime = row[:5] + (name, controller_job_id, controller_port, load_balancer_port, status, + uptime, policy, auto_restart, requested_resources) = row[:9] return { 'name': name, 'controller_job_id': controller_job_id, 'controller_port': controller_port, + 'load_balancer_port': load_balancer_port, 'status': ServiceStatus[status], 'uptime': uptime, + 'policy': policy, + 'auto_restart': bool(auto_restart), + 'requested_resources': pickle.loads(requested_resources), } diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 82facca7a61..e9e278a4e48 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -14,7 +14,6 @@ import colorama import filelock -import requests from sky import backends from sky import exceptions @@ -159,10 +158,10 @@ def __init__(self, func: Callable, file: str) -> None: self.file = file def run(self, *args, **kwargs): - # pylint: disable=import-outside-toplevel - import sys + import sys # pylint: disable=import-outside-toplevel + + from sky import sky_logging # pylint: disable=import-outside-toplevel - from sky import sky_logging with open(self.file, 'w') as f: sys.stdout = f sys.stderr = f @@ -265,27 +264,6 @@ def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: return f'{service_name}-{replica_id}' -def get_replica_id_from_cluster_name(cluster_name: str) -> int: - return int(cluster_name.split('-')[-1]) - - -def gen_ports_for_serve_process(controller_name: str) -> Tuple[int, int]: - services = global_user_state.get_services_from_controller_name( - controller_name) - existing_controller_ports, existing_load_balancer_ports = set(), set() - for service in services: - service_handle: ServiceHandle = service['handle'] - existing_controller_ports.add(service_handle.controller_port) - 
existing_load_balancer_ports.add(service_handle.load_balancer_port) - controller_port = constants.CONTROLLER_PORT_START - while controller_port in existing_controller_ports: - controller_port += 1 - load_balancer_port = constants.LOAD_BALANCER_PORT_START - while load_balancer_port in existing_load_balancer_ports: - load_balancer_port += 1 - return controller_port, load_balancer_port - - def _get_service_slot_on_controller(controller_name: str) -> int: """Get the number of slots to run services on the controller. @@ -400,8 +378,6 @@ class ServiceHandle(object): - (required) Service name. - (required) Service requested controller resources. - - (required) Controller port. - - (required) LoadBalancer port. - (optional) Service endpoint IP. This class is only used as a cache for information fetched from controller. @@ -413,50 +389,67 @@ def __init__( *, service_name: str, requested_controller_resources: 'sky.Resources', - controller_port: int, - load_balancer_port: int, - endpoint_ip: Optional[str] = None, + endpoint: Optional[str] = None, ) -> None: self._version = self._VERSION self.service_name = service_name self.requested_controller_resources = requested_controller_resources - self.controller_port = controller_port - self.load_balancer_port = load_balancer_port - self.endpoint_ip = endpoint_ip + self.endpoint = endpoint def __repr__(self) -> str: return ('ServiceHandle(' f'\n\tservice_name={self.service_name},' '\n\trequested_controller_resources=' f'{self.requested_controller_resources},' - f'\n\tcontroller_port={self.controller_port},' - f'\n\tload_balancer_port={self.load_balancer_port},' - f'\n\tendpoint_ip={self.endpoint_ip})') + f'\n\tendpoint={self.endpoint})') def __setstate__(self, state): self._version = self._VERSION self.__dict__.update(state) -def _get_latest_info(service_name: str, decode: bool = True) -> Dict[str, Any]: +def get_replica_info(service_name: str, + with_handle: bool) -> List[Dict[str, Any]]: + """Get the information of all replicas of 
the service. + + Args: + service_name: The name of the service. + with_handle: Whether to include the handle of the replica. + + Returns: + A list of dictionaries of replica information. + """ + return [ + info.to_info_dict(with_handle=with_handle) + for info in serve_state.get_replica_infos(service_name) + ] + + +def get_latest_info(service_name: str) -> Dict[str, Any]: + """Get the latest information of the service. + + Args: + service_name: The name of the service. + + Returns: + A dictionary of latest information of the service. + """ + # NOTE(dev): Keep this align with + # sky.backends.backend_utils._add_default_value_to_local_record record = serve_state.get_service_from_name(service_name) if record is None: raise ValueError(f'Service {service_name!r} does not exist.') - controller_port = record['controller_port'] - resp = requests.get( - _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) + - '/controller/get_latest_info') - resp.raise_for_status() - if not decode: - return resp.json() - return { - k: pickle.loads(base64.b64decode(v)) for k, v in resp.json().items() - } + record['replica_info'] = get_replica_info(service_name, with_handle=True) + return record -def get_latest_info(service_name: str) -> str: - return common_utils.encode_payload( - _get_latest_info(service_name, decode=False)) +def get_latest_info_encoded(service_name: str) -> str: + latest_info = get_latest_info(service_name) + latest_info = { + k: base64.b64encode(pickle.dumps(v)).decode('utf-8') + for k, v in latest_info.items() + } + return common_utils.encode_payload(latest_info) def load_latest_info(payload: str) -> Dict[str, Any]: @@ -480,8 +473,19 @@ def terminate_service(service_name: str) -> None: for _ in range(constants.SERVICE_TERMINATION_TIMEOUT): record = serve_state.get_service_from_name(service_name) if record is None: - break + return + if record['status'] == serve_state.ServiceStatus.FAILED_CLEANUP: + raise RuntimeError( + f'Failed to terminate service {service_name!r}. 
Some ' + 'resources are not cleaned up properly. Please SSH to ' + 'the controller and manually clean up them. Find the ' + 'replicas that not been terminated by `sky serve status ' + f'{service_name!r}`.') time.sleep(1) + raise RuntimeError( + f'Failed to terminate service {service_name!r}: timeout ' + f'after {constants.SERVICE_TERMINATION_TIMEOUT} seconds. ' + 'Please try again later.') def check_service_status_healthy(service_name: str) -> Optional[str]: @@ -593,8 +597,7 @@ def stream_replica_logs(service_name: str, f'{colorama.Style.RESET_ALL}') def _get_replica_status() -> serve_state.ReplicaStatus: - latest_info = _get_latest_info(service_name) - replica_info = latest_info['replica_info'] + replica_info = get_replica_info(service_name, with_handle=False) for info in replica_info: if info['replica_id'] == replica_id: return info['status'] @@ -667,6 +670,24 @@ def _service_is_terminal() -> bool: return '' +def wait_for_load_balancer_port(service_name: str) -> str: + # Sleep for a while to bootstrap the load balancer. + time.sleep(5) + for _ in range(constants.SERVICE_PORT_SELECTION_TIMEOUT): + latest_info = get_latest_info(service_name) + load_balancer_port = latest_info['load_balancer_port'] + if load_balancer_port is not None: + return common_utils.encode_payload(load_balancer_port) + time.sleep(1) + raise RuntimeError( + f'Failed to get load balancer port for service {service_name!r}: ' + f'timeout after {constants.SERVICE_PORT_SELECTION_TIMEOUT} seconds.') + + +def decode_load_balancer_port(payload: str) -> str: + return common_utils.decode_payload(payload) + + class ServeCodeGen: """Code generator for SkyServe. 
@@ -679,18 +700,16 @@ class ServeCodeGen: ] @classmethod - def add_service(cls, job_id: int, service_handle: ServiceHandle) -> str: + def add_service(cls, job_id: int, service_name: str) -> str: code = [ - f'serve_state.add_service({job_id}, ' - f'{service_handle.service_name!r}, ' - f'{service_handle.controller_port})', + f'serve_state.add_or_update_service({job_id}, {service_name!r})', ] return cls._build(code) @classmethod def get_latest_info(cls, service_name: str) -> str: code = [ - f'msg = serve_utils.get_latest_info({service_name!r})', + f'msg = serve_utils.get_latest_info_encoded({service_name!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @@ -723,6 +742,14 @@ def stream_serve_process_logs(cls, service_name: str, ] return cls._build(code) + @classmethod + def wait_for_load_balancer_port(cls, service_name: str) -> str: + code = [ + f'msg = serve_utils.wait_for_load_balancer_port({service_name!r})', + 'print(msg, flush=True)' + ] + return cls._build(code) + @classmethod def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code diff --git a/sky/serve/service.py b/sky/serve/service.py index 90c23f8d287..038d1bfdad7 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -8,10 +8,13 @@ from typing import Dict, List import filelock +import yaml from sky import authentication from sky import exceptions +from sky import resources from sky import serve +from sky import sky_logging from sky import task as task_lib from sky.backends import cloud_vm_ray_backend from sky.serve import constants @@ -20,8 +23,14 @@ from sky.serve import load_balancer from sky.serve import serve_state from sky.serve import serve_utils +from sky.utils import common_utils from sky.utils import subprocess_utils +# Use the explicit logger name so that the logger is under the +# `sky.serve.service` namespace when executed directly, so as +# to inherit the setup from the `sky` logger. 
+logger = sky_logging.init_logger('sky.serve.service') + def _handle_signal(service_name: str) -> None: """Handles the signal user sent to controller.""" @@ -35,7 +44,10 @@ def _handle_signal(service_name: str) -> None: user_signal_text = f.read().strip() try: user_signal = serve_utils.UserSignal(user_signal_text) + logger.info(f'User signal received: {user_signal}') except ValueError: + logger.warning( + f'Unknown signal received: {user_signal}. Ignoring.') user_signal = None # Remove the signal file, after reading it. signal_file.unlink() @@ -47,6 +59,7 @@ def _handle_signal(service_name: str) -> None: def _cleanup(service_name: str, task_yaml: str) -> bool: + """Clean up the sky serve replicas, storage, and service record.""" failed = False replica_infos = serve_state.get_replica_infos(service_name) info2proc: Dict[infra_providers.ReplicaInfo, @@ -56,64 +69,95 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: args=(info.cluster_name,)) p.start() info2proc[info] = p + # Set replica status to `SHUTTING_DOWN` info.status_property.sky_launch_status = ( infra_providers.ProcessStatus.SUCCEEDED) info.status_property.sky_down_status = ( infra_providers.ProcessStatus.RUNNING) serve_state.add_or_update_replica(service_name, info.replica_id, info) + logger.info(f'Terminating replica {info.replica_id} ...') for info, p in info2proc.items(): p.join() if p.exitcode == 0: serve_state.remove_replica(service_name, info.replica_id) + logger.info(f'Replica {info.replica_id} terminated successfully.') else: + # Set replica status to `FAILED_CLEANUP` info.status_property.sky_down_status = ( infra_providers.ProcessStatus.FAILED) serve_state.add_or_update_replica(service_name, info.replica_id, info) failed = True + logger.error(f'Replica {info.replica_id} failed to terminate.') task = task_lib.Task.from_yaml(task_yaml) backend = cloud_vm_ray_backend.CloudVmRayBackend() backend.teardown_ephemeral_storage(task) return failed -def _start(service_name: str, service_dir: str, 
task_yaml: str, - controller_port: int, load_balancer_port: int, - controller_log_file: str, load_balancer_log_file: str): +def _start(service_name: str, task_yaml: str): """Starts the service.""" - # Create the service working directory if it does not exist. - service_dir = os.path.expanduser(service_dir) + # Generate log file name. + load_balancer_log_file = os.path.expanduser( + serve_utils.generate_remote_load_balancer_log_file_name(service_name)) + + # Create the service working directory. + service_dir = os.path.expanduser( + serve_utils.generate_remote_service_dir_name(service_name)) os.makedirs(service_dir, exist_ok=True) # Generate ssh key pair to avoid race condition when multiple sky.launch # are executed at the same time. authentication.get_or_generate_keys() + # Store service information in the serve state. service_spec = serve.SkyServiceSpec.from_yaml(task_yaml) + record = serve_state.get_service_from_name(service_name) + if record is None: + raise ValueError(f'Service {service_name} does not exist.') + record['policy'] = service_spec.policy_str() + record['auto_restart'] = service_spec.auto_restart + with open(task_yaml, 'r') as f: + config = yaml.safe_load(f) + resources_config = None + if isinstance(config, dict): + resources_config = config.get('resources') + requested_resources = resources.Resources.from_yaml_config(resources_config) + record['requested_resources'] = requested_resources + serve_state.add_or_update_service(**record) controller_process = None load_balancer_process = None try: _handle_signal(service_name) - # Start the controller. - controller_process = multiprocessing.Process( - target=serve_utils.RedirectOutputTo(controller.run_controller, - controller_log_file).run, - args=(service_name, service_spec, task_yaml, controller_port)) - controller_process.start() - - # Sleep for a while to make sure the controller is up. - time.sleep(10) - - # TODO(tian): Support HTTPS. 
- controller_addr = f'http://localhost:{controller_port}' - replica_port = int(service_spec.replica_port) - # Start the load balancer. - load_balancer_process = multiprocessing.Process( - target=serve_utils.RedirectOutputTo(load_balancer.run_load_balancer, - load_balancer_log_file).run, - args=(controller_addr, load_balancer_port, replica_port)) - load_balancer_process.start() + + with filelock.FileLock( + os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)): + controller_port = common_utils.find_free_port( + constants.CONTROLLER_PORT_START) + # Start the controller. + controller_process = multiprocessing.Process( + target=controller.run_controller, + args=(service_name, service_spec, task_yaml, controller_port)) + controller_process.start() + serve_state.set_service_controller_port(service_name, + controller_port) + + # TODO(tian): Support HTTPS. + controller_addr = f'http://localhost:{controller_port}' + replica_port = int(service_spec.replica_port) + load_balancer_port = common_utils.find_free_port( + constants.LOAD_BALANCER_PORT_START) + + # Start the load balancer. 
+ load_balancer_process = multiprocessing.Process( + target=serve_utils.RedirectOutputTo( + load_balancer.run_load_balancer, + load_balancer_log_file).run, + args=(controller_addr, load_balancer_port, replica_port)) + load_balancer_process.start() + serve_state.set_service_load_balancer_port(service_name, + load_balancer_port) while True: _handle_signal(service_name) @@ -135,47 +179,25 @@ def _start(service_name: str, service_dir: str, task_yaml: str, process.join() failed = _cleanup(service_name, task_yaml) if failed: - serve_state.set_service_status(service_name, - serve_state.ServiceStatus.FAILED) + serve_state.set_service_status( + service_name, serve_state.ServiceStatus.FAILED_CLEANUP) else: shutil.rmtree(service_dir) serve_state.remove_service(service_name) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='SkyServe Controller') + parser = argparse.ArgumentParser(description='Sky Serve Service') parser.add_argument('--service-name', type=str, help='Name of the service', required=True) - parser.add_argument('--service-dir', - type=str, - help='Working directory of the service', - required=True) parser.add_argument('--task-yaml', type=str, help='Task YAML file', required=True) - parser.add_argument('--controller-port', - type=int, - help='Port to run the controller', - required=True) - parser.add_argument('--load-balancer-port', - type=int, - help='Port to run the load balancer on.', - required=True) - parser.add_argument('--controller-log-file', - type=str, - help='Log file path for the controller', - required=True) - parser.add_argument('--load-balancer-log-file', - type=str, - help='Log file path for the load balancer', - required=True) args = parser.parse_args() # We start process with 'spawn', because 'fork' could result in weird # behaviors; 'spawn' is also cross-platform. 
multiprocessing.set_start_method('spawn', force=True) - _start(args.service_name, args.service_dir, args.task_yaml, - args.controller_port, args.load_balancer_port, - args.controller_log_file, args.load_balancer_log_file) + _start(args.service_name, args.task_yaml) diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index d47cfd1856e..3ab2e4b6e0a 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -19,12 +19,8 @@ run: | # Start sky serve service. python -u -m sky.serve.service \ --service-name {{service_name}} \ - --service-dir {{service_dir}} \ --task-yaml {{remote_task_yaml_path}} \ - --controller-port {{controller_port}} \ - --load-balancer-port {{load_balancer_port}} \ - --controller-log-file {{controller_log_file}} \ - --load-balancer-log-file {{load_balancer_log_file}} + > {{controller_log_file}} 2>&1 envs: {%- for env_name, env_value in envs.items() %} diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index cd31ca49f3e..1ec0a2d9a4b 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -422,11 +422,7 @@ def _get_replicas(service_record: _ServiceRecord) -> str: def get_endpoint(service_record: _ServiceRecord) -> Optional[str]: handle = _get_service_handle(service_record) - if handle.endpoint_ip is None: - return None - if handle.load_balancer_port is None: - return None - return f'{handle.endpoint_ip}:{handle.load_balancer_port}' + return handle.endpoint def _get_display_endpoint(service_record: _ServiceRecord) -> str: From 5668049171552ab491b4d718410dbb9a282751b2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 15 Oct 2023 23:25:56 -0700 Subject: [PATCH 127/223] minor --- sky/backends/cloud_vm_ray_backend.py | 12 ++--------- sky/serve/autoscalers.py | 4 ++-- sky/serve/constants.py | 2 +- sky/serve/controller.py | 5 +---- sky/serve/infra_providers.py | 4 ++-- 
sky/serve/load_balancer.py | 5 +---- sky/serve/load_balancing_policies.py | 5 +++-- sky/serve/serve_state.py | 3 ++- sky/serve/serve_utils.py | 21 +++++++++---------- sky/serve/service.py | 24 +++++++++++++--------- sky/templates/sky-serve-controller.yaml.j2 | 1 + 11 files changed, 39 insertions(+), 47 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 12d88815dbf..9568bba3db7 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3277,7 +3277,6 @@ def _exec_code_on_head( executable: str, detach_run: bool = False, spot_dag: Optional['dag.Dag'] = None, - service_handle: Optional['serve_lib.ServiceHandle'] = None, ) -> None: """Executes generated code on the head node.""" style = colorama.Style @@ -3348,11 +3347,6 @@ def _exec_code_on_head( # the controller process job, as it will stay in the job pending # table and not be executed until there is an empty slot. job_submit_cmd = job_submit_cmd + ' && ' + spot_code - if service_handle is not None: - # Add the service to service table on controller VM. 
- serve_code = serve_lib.ServeCodeGen.add_service( - job_id, service_handle.service_name) - job_submit_cmd = job_submit_cmd + ' && ' + serve_code returncode, stdout, stderr = self.run_on_head(handle, job_submit_cmd, @@ -4728,8 +4722,7 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag, - service_handle=task.service_handle) + spot_dag=task.spot_dag) def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, @@ -4803,5 +4796,4 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag, - service_handle=task.service_handle) + spot_dag=task.spot_dag) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 2857dfdaa91..533ba86b7b8 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -2,11 +2,11 @@ import bisect import dataclasses import enum -import logging import time import typing from typing import Any, Dict, List, Optional, Union +from sky import sky_logging from sky.serve import constants from sky.serve import serve_state from sky.serve import serve_utils @@ -14,7 +14,7 @@ if typing.TYPE_CHECKING: from sky.serve import service_spec -logger = logging.getLogger(__name__) +logger = sky_logging.init_logger(__name__) # Since sky.launch is very resource demanding, we limit the number of # concurrent sky.launch process to avoid overloading the machine. diff --git a/sky/serve/constants.py b/sky/serve/constants.py index c7b74f3557e..59a9e2f7830 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -29,7 +29,7 @@ SERVICE_TERMINATION_TIMEOUT = 180 # Timeout for waiting controller to find a port for service processes. -SERVICE_PORT_SELECTION_TIMEOUT = 20 +SERVICE_PORT_SELECTION_TIMEOUT = 60 # The time interval for load balancer to sync with controller. 
Every time the # load balancer syncs with controller, it will update all available replica ips diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 8d730f78568..8ee2ac5c51b 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -20,10 +20,7 @@ from sky.serve import serve_utils from sky.utils import env_options -# Use the explicit logger name so that the logger is under the -# `sky.serve.controller` namespace when executed directly, so as -# to inherit the setup from the `sky` logger. -logger = sky_logging.init_logger('sky.serve.controller') +logger = sky_logging.init_logger(__name__) class SuppressSuccessGetAccessLogsFilter(logging.Filter): diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 533b4f5cc9b..0e32deee0f9 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -3,7 +3,6 @@ import dataclasses import enum import functools -import logging import os import signal import subprocess @@ -19,6 +18,7 @@ import sky from sky import backends from sky import global_user_state +from sky import sky_logging from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state @@ -31,7 +31,7 @@ if typing.TYPE_CHECKING: from sky.serve import service_spec -logger = logging.getLogger(__name__) +logger = sky_logging.init_logger(__name__) _JOB_STATUS_FETCH_INTERVAL = 30 _PROCESS_POOL_REFRESH_INTERVAL = 20 diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 1bc84a70037..b82debb18c0 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -13,10 +13,7 @@ from sky.serve import load_balancing_policies as lb_policies from sky.serve import serve_utils -# Use the explicit logger name so that the logger is under the -# `sky.serve.load_balancer` namespace when executed directly, so as -# to inherit the setup from the `sky` logger. 
-logger = sky_logging.init_logger('sky.serve.load_balancer') +logger = sky_logging.init_logger(__name__) class SkyServeLoadBalancer: diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index bfb01661aed..4aa3e46f5f4 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,10 +1,11 @@ """LoadBalancingPolicy: Policy to select endpoint.""" -import logging from typing import List, Optional, Set import fastapi -logger = logging.getLogger(__name__) +from sky import sky_logging + +logger = sky_logging.init_logger(__name__) class LoadBalancingPolicy: diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index a3fbf1bca6d..818bf20b6e1 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -26,6 +26,7 @@ _CONN = sqlite3.connect(_DB_PATH) _CURSOR = _CONN.cursor() +# TODO(tian): Probably change back to ServiceHandle... _CURSOR.execute("""\ CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, @@ -164,8 +165,8 @@ def from_replica_statuses( # === Service functions === def add_or_update_service( - controller_job_id: int, name: str, + controller_job_id: int, controller_port: Optional[int] = None, load_balancer_port: Optional[int] = None, status: ServiceStatus = ServiceStatus.CONTROLLER_INIT, diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e9e278a4e48..4ed6949c82a 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -29,7 +29,6 @@ import sky -_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}' _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' _FAILED_TO_FIND_REPLICA_MSG = ( @@ -472,9 +471,11 @@ def terminate_service(service_name: str) -> None: print(f'Service {service_name!r} is scheduled to be terminated.') for _ in range(constants.SERVICE_TERMINATION_TIMEOUT): record = serve_state.get_service_from_name(service_name) + replica_infos = 
serve_state.get_replica_infos(service_name) if record is None: - return - if record['status'] == serve_state.ServiceStatus.FAILED_CLEANUP: + if not replica_infos: + return + elif record['status'] == serve_state.ServiceStatus.FAILED_CLEANUP: raise RuntimeError( f'Failed to terminate service {service_name!r}. Some ' 'resources are not cleaned up properly. Please SSH to ' @@ -674,7 +675,12 @@ def wait_for_load_balancer_port(service_name: str) -> str: # Sleep for a while to bootstrap the load balancer. time.sleep(5) for _ in range(constants.SERVICE_PORT_SELECTION_TIMEOUT): - latest_info = get_latest_info(service_name) + try: + latest_info = get_latest_info(service_name) + except ValueError: + # Service is not created yet. + time.sleep(1) + continue load_balancer_port = latest_info['load_balancer_port'] if load_balancer_port is not None: return common_utils.encode_payload(load_balancer_port) @@ -699,13 +705,6 @@ class ServeCodeGen: 'from sky.serve import serve_utils', ] - @classmethod - def add_service(cls, job_id: int, service_name: str) -> str: - code = [ - f'serve_state.add_or_update_service({job_id}, {service_name!r})', - ] - return cls._build(code) - @classmethod def get_latest_info(cls, service_name: str) -> str: code = [ diff --git a/sky/serve/service.py b/sky/serve/service.py index 038d1bfdad7..2278ee68fc2 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -95,7 +95,7 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: return failed -def _start(service_name: str, task_yaml: str): +def _start(service_name: str, task_yaml: str, job_id: int): """Starts the service.""" # Generate log file name. load_balancer_log_file = os.path.expanduser( @@ -110,21 +110,19 @@ def _start(service_name: str, task_yaml: str): # are executed at the same time. authentication.get_or_generate_keys() - # Store service information in the serve state. + # Initialize database record for the service. 
service_spec = serve.SkyServiceSpec.from_yaml(task_yaml) - record = serve_state.get_service_from_name(service_name) - if record is None: - raise ValueError(f'Service {service_name} does not exist.') - record['policy'] = service_spec.policy_str() - record['auto_restart'] = service_spec.auto_restart with open(task_yaml, 'r') as f: config = yaml.safe_load(f) resources_config = None if isinstance(config, dict): resources_config = config.get('resources') requested_resources = resources.Resources.from_yaml_config(resources_config) - record['requested_resources'] = requested_resources - serve_state.add_or_update_service(**record) + serve_state.add_or_update_service(service_name, + job_id, + policy=service_spec.policy_str(), + auto_restart=service_spec.auto_restart, + requested_resources=requested_resources) controller_process = None load_balancer_process = None @@ -181,9 +179,11 @@ def _start(service_name: str, task_yaml: str): if failed: serve_state.set_service_status( service_name, serve_state.ServiceStatus.FAILED_CLEANUP) + logger.error(f'Service {service_name} failed to clean up.') else: shutil.rmtree(service_dir) serve_state.remove_service(service_name) + logger.info(f'Service {service_name} terminated successfully.') if __name__ == '__main__': @@ -196,8 +196,12 @@ def _start(service_name: str, task_yaml: str): type=str, help='Task YAML file', required=True) + parser.add_argument('--job-id', + required=True, + type=int, + help='Job id for the service job.') args = parser.parse_args() # We start process with 'spawn', because 'fork' could result in weird # behaviors; 'spawn' is also cross-platform. 
multiprocessing.set_start_method('spawn', force=True) - _start(args.service_name, args.task_yaml) + _start(args.service_name, args.task_yaml, args.job_id) diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 3ab2e4b6e0a..f5562a8bd49 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -20,6 +20,7 @@ run: | python -u -m sky.serve.service \ --service-name {{service_name}} \ --task-yaml {{remote_task_yaml_path}} \ + --job-id $SKYPILOT_INTERNAL_JOB_ID \ > {{controller_log_file}} 2>&1 envs: From 2984996c5d44196d542f6fb9947d9370dbac8c49 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 11:15:23 -0700 Subject: [PATCH 128/223] refactor dataabse --- sky/backends/backend_utils.py | 7 +- sky/backends/cloud_vm_ray_backend.py | 9 +-- sky/cli.py | 2 +- sky/core.py | 27 ++++--- sky/execution.py | 45 ++++------- sky/global_user_state.py | 41 +++++----- sky/serve/__init__.py | 2 - sky/serve/constants.py | 1 - sky/serve/serve_state.py | 32 ++++---- sky/serve/serve_utils.py | 111 +++++++++------------------ sky/serve/service.py | 10 +-- sky/task.py | 2 +- sky/utils/cli_utils/status_utils.py | 15 +--- 13 files changed, 116 insertions(+), 188 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index dd76ccff1fb..7b0da2b9ffb 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2721,7 +2721,7 @@ def _add_default_value_to_local_record( 'status': serve_lib.ServiceStatus.UNKNOWN, 'controller_port': None, 'load_balancer_port': None, - 'policy': '', + 'policy': None, 'auto_restart': False, 'requested_resources': sky.Resources(), }) @@ -2749,8 +2749,7 @@ def _refresh_service_record_no_lock( except exceptions.NetworkError: return record, 'Failed to refresh replica info due to network error.' 
- service_handle: serve_lib.ServiceHandle = record['handle'] - if not service_handle.endpoint: + if not record['endpoint']: # Service controller is still initializing. Skipped refresh status. return record, None @@ -2934,7 +2933,7 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: # We set CPU resource for sky serve controller to a smaller value # to support a larger number of services. 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND - if task.service_handle is not None else DEFAULT_TASK_CPU_DEMAND) + if task.service_name is not None else DEFAULT_TASK_CPU_DEMAND) } if task.best_resources is not None: resources = task.best_resources diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9568bba3db7..4b42d137429 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3822,19 +3822,18 @@ def tail_spot_logs(self, ) def tail_serve_logs(self, handle: CloudVmRayResourceHandle, - service_handle: serve_lib.ServiceHandle, - target: serve_lib.ServiceComponent, + service_name: str, target: serve_lib.ServiceComponent, replica_id: Optional[int], follow: bool) -> None: if target != serve_lib.ServiceComponent.REPLICA: code = serve_lib.ServeCodeGen.stream_serve_process_logs( - service_handle.service_name, + service_name, stream_controller=( target == serve_lib.ServiceComponent.CONTROLLER), follow=follow) else: - assert replica_id is not None, service_handle + assert replica_id is not None, service_name code = serve_lib.ServeCodeGen.stream_replica_logs( - service_handle.service_name, replica_id, follow) + service_name, replica_id, follow) signal.signal(signal.SIGINT, backend_utils.interrupt_handler) signal.signal(signal.SIGTSTP, backend_utils.stop_handler) diff --git a/sky/cli.py b/sky/cli.py index 5fa51196cec..47a4a6cc616 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4253,7 +4253,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): if record is None: with 
ux_utils.print_exception_no_traceback(): raise ValueError(f'\nService {service_name!r} not found.') - service_endpoint = status_utils.get_endpoint(record) + service_endpoint = record['endpoint'] if service_endpoint is None: with ux_utils.print_exception_no_traceback(): raise ValueError( diff --git a/sky/core.py b/sky/core.py index 6ea5843d306..c7031fadcdf 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1010,9 +1010,17 @@ def serve_status( 'launched_at': (int) timestamp of creation, 'controller_name': (str) name of the controller cluster of the service, - 'handle': (serve.ServiceHandle) handle of the service, - 'status': (sky.ServiceStatus) service status, + 'endpoint': (str) service endpoint, 'replica_info': (List[Dict[str, Any]]) replica information, + 'uptime': (int) uptime in seconds, + 'status': (sky.ServiceStatus) service status, + 'controller_port': (Optional[int]) controller port, + 'load_balancer_port': (Optional[int]) load balancer port, + 'policy': (Optional[str]) load balancer policy description, + 'auto_restart': (bool) whether the service replcia will be + auto-restarted, + 'requested_resources': (sky.Resources) requested resources + for replica, } Each entry in replica_info has the following fields: @@ -1097,13 +1105,12 @@ def serve_tail_logs( with ux_utils.print_exception_no_traceback(): raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') - service_record = global_user_state.get_service_from_name(service_name) - if service_record is None: + controller_name = global_user_state.get_service_controller_name( + service_name) + if controller_name is None: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r} does not exist. 
' 'Cannot stream logs.') - service_handle: serve.ServiceHandle = service_record['handle'] - controller_name = service_record['controller_name'] controller_status, handle = backend_utils.refresh_cluster_status_handle( controller_name) if controller_status is None: @@ -1119,7 +1126,7 @@ def serve_tail_logs( backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend backend.tail_serve_logs(handle, - service_handle, + service_name, target, replica_id, follow=follow) @@ -1139,13 +1146,13 @@ def serve_down(service_name: str, purge: bool = False) -> None: ValueError: if the service does not exist. RuntimeError: if failed to terminate the service. """ - service_record = global_user_state.get_service_from_name(service_name) + controller_name = global_user_state.get_service_controller_name( + service_name) - if service_record is None: + if controller_name is None: with ux_utils.print_exception_no_traceback(): raise ValueError(f'Service {service_name!r} not found.') - controller_name = service_record['controller_name'] handle = global_user_state.get_handle_from_cluster_name(controller_name) controller_fetch_ip_error_message = ( diff --git a/sky/execution.py b/sky/execution.py index c9393e74df3..fdd37891758 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1027,32 +1027,17 @@ def serve_up( # name. # In the same time, generate ports for the controller and load balancer. # Use file lock to make sure the ports are unique to each service. 
- try: - with filelock.FileLock( - os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH), - serve.CONTROLLER_FILE_LOCK_TIMEOUT): - controller_name, _ = serve.get_available_controller_name() - - service_handle = serve.ServiceHandle( - service_name=service_name, - requested_controller_resources=controller_resources) - - global_user_state.add_or_update_service( - service_name, - launched_at=int(time.time()), - controller_name=controller_name, - handle=service_handle) - # TODO(tian): Probably run another sky.launch after we get - # the load balancer port from the controller? So we don't - # need to open so many ports here. - controller_resources = controller_resources.copy( - ports=[serve.LOAD_BALANCER_PORT_RANGE]) - except filelock.Timeout as e: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Timeout when obtaining controller lock for service ' - f'{service_name!r}. Please check if there are some ' - '`sky serve up` process hanging abnormally.') from e + with filelock.FileLock(os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH)): + controller_name = serve.get_available_controller_name() + + global_user_state.add_service(service_name, + launched_at=int(time.time()), + controller_name=controller_name) + # TODO(tian): Probably run another sky.launch after we get + # the load balancer port from the controller? So we don't + # need to open so many ports here. + controller_resources = controller_resources.copy( + ports=[serve.LOAD_BALANCER_PORT_RANGE]) _maybe_translate_local_file_mounts_and_sync_up(task, prefix='serve') @@ -1084,7 +1069,7 @@ def serve_up( # Set this to modify default ray task CPU usage to custom value # instead of default 0.5 vCPU. We need to set it to a smaller value # to support a larger number of services. 
- controller_task.service_handle = service_handle + controller_task.service_name = service_name fore = colorama.Fore style = colorama.Style @@ -1124,8 +1109,8 @@ def serve_up( f'{service_name!r}.'), stderr) load_balancer_port = serve.decode_load_balancer_port( lb_port_payload) - service_handle.endpoint = f'{handle.head_ip}:{load_balancer_port}' - global_user_state.set_service_handle(service_name, service_handle) + endpoint = f'{handle.head_ip}:{load_balancer_port}' + global_user_state.set_service_endpoint(service_name, endpoint) print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' f'{style.RESET_ALL}') @@ -1151,7 +1136,7 @@ def serve_up( f'{backend_utils.RESET_BOLD} to get all valid REPLICA_ID)') print(f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' f'{style.RESET_ALL}{fore.CYAN}' - f'{service_handle.endpoint}{style.RESET_ALL}') + f'{endpoint}{style.RESET_ALL}') print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') print('\nTo monitor replica status:' f'\t{backend_utils.BOLD}watch -n10 sky serve status ' diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 09dc6cc07b8..b4316746df1 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -99,7 +99,7 @@ def create_table(cursor, conn): name TEXT PRIMARY KEY, launched_at INTEGER, controller_name TEXT, - handle BLOB)""") + endpoint TEXT DEFAULT NULL)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. 
@@ -280,19 +280,16 @@ def add_or_update_cluster(cluster_name: str, _DB.conn.commit() -def add_or_update_service(name: str, launched_at: int, controller_name: str, - handle: 'serve.ServiceHandle') -> None: +def add_service(name: str, launched_at: int, controller_name: str) -> None: _DB.cursor.execute( - 'INSERT or REPLACE INTO services' - '(name, launched_at, controller_name, handle) ' + 'INSERT INTO services' + '(name, launched_at, controller_name) ' 'VALUES (' # name '?, ' # launched_at '?, ' # controller_name - '?, ' - # handle '?' ')', ( @@ -302,8 +299,6 @@ def add_or_update_service(name: str, launched_at: int, controller_name: str, launched_at, # controller_name controller_name, - # handle - pickle.dumps(handle), )) _DB.conn.commit() @@ -355,9 +350,9 @@ def remove_service(service_name: str): _DB.conn.commit() -def set_service_handle(service_name: str, handle: 'serve.ServiceHandle'): - _DB.cursor.execute('UPDATE services SET handle=(?) ' - 'WHERE name=(?)', (pickle.dumps(handle), service_name)) +def set_service_endpoint(service_name: str, endpoint: str): + _DB.cursor.execute('UPDATE services SET endpoint=(?) ' + 'WHERE name=(?)', (endpoint, service_name)) count = _DB.cursor.rowcount _DB.conn.commit() assert count <= 1, count @@ -597,13 +592,13 @@ def _get_service_from_row(row) -> Dict[str, Any]: # Explicitly specify the number of fields to unpack, so that # we can add new fields to the database in the future without # breaking the previous code. 
- name, launched_at, controller_name, handle = row[:4] + name, launched_at, controller_name, endpoint = row[:4] # TODO: use namedtuple instead of dict return { 'name': name, 'launched_at': launched_at, 'controller_name': controller_name, - 'handle': pickle.loads(handle), + 'endpoint': endpoint, } @@ -616,6 +611,15 @@ def get_service_from_name( return None +def get_service_controller_name(service_name: Optional[str]) -> Optional[str]: + rows = _DB.cursor.execute( + 'SELECT controller_name FROM services WHERE name=(?)', + (service_name,)).fetchall() + for (controller_name,) in rows: + return controller_name + return None + + def get_services_from_controller_name( controller_name: str) -> List[Dict[str, Any]]: rows = _DB.cursor.execute( @@ -628,15 +632,6 @@ def get_services_from_controller_name( return records -def get_handle_from_service_name( - service_name: Optional[str]) -> Optional['serve.ServiceHandle']: - rows = _DB.cursor.execute('SELECT handle FROM services WHERE name=(?)', - (service_name,)).fetchall() - for (handle,) in rows: - return pickle.loads(handle) - return None - - def get_clusters() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'select * from clusters order by launched_at desc').fetchall() diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index ea177ceee76..0544ff53fa9 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -2,7 +2,6 @@ import os from sky.serve.constants import CONTROLLER_FILE_LOCK_PATH -from sky.serve.constants import CONTROLLER_FILE_LOCK_TIMEOUT from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.serve.constants import CONTROLLER_PREFIX from sky.serve.constants import CONTROLLER_RESOURCES @@ -23,7 +22,6 @@ from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent -from sky.serve.serve_utils import ServiceHandle from sky.serve.service_spec import SkyServiceSpec 
os.makedirs(os.path.expanduser(SERVE_PREFIX), exist_ok=True) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 59a9e2f7830..dde3be40c8a 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -15,7 +15,6 @@ # service. In our current multi-service controller implementation, we need to # select a controller if there are some existing controllers. CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' -CONTROLLER_FILE_LOCK_TIMEOUT = 20 # The filelock for selecting service ports when starting a service. We need to # have a filelock to avoid port collision when starting multiple services at diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 818bf20b6e1..607add9f9fc 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -26,13 +26,12 @@ _CONN = sqlite3.connect(_DB_PATH) _CURSOR = _CONN.cursor() -# TODO(tian): Probably change back to ServiceHandle... _CURSOR.execute("""\ CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, controller_job_id INTEGER, - controller_port INTEGER, - load_balancer_port INTEGER, + controller_port INTEGER DEFAULT NULL, + load_balancer_port INTEGER DEFAULT NULL, status TEXT, uptime INTEGER DEFAULT NULL, policy TEXT, @@ -164,26 +163,21 @@ def from_replica_statuses( # === Service functions === -def add_or_update_service( - name: str, - controller_job_id: int, - controller_port: Optional[int] = None, - load_balancer_port: Optional[int] = None, - status: ServiceStatus = ServiceStatus.CONTROLLER_INIT, - uptime: Optional[int] = None, - policy: Optional[str] = None, - auto_restart: bool = False, - requested_resources: Optional['sky.Resources'] = None) -> None: +def add_service(name: str, + controller_job_id: int, + policy: str, + auto_restart: bool, + requested_resources: 'sky.Resources', + status: ServiceStatus = ServiceStatus.CONTROLLER_INIT) -> None: """Adds a service to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( """\ - INSERT OR 
REPLACE INTO services - (name, controller_job_id, controller_port, load_balancer_port, - status, uptime, policy, auto_restart, requested_resources) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", - (name, controller_job_id, controller_port, load_balancer_port, - status.value, uptime, policy, int(auto_restart), + INSERT INTO services + (name, controller_job_id, status, policy, + auto_restart, requested_resources) + VALUES (?, ?, ?, ?, ?, ?)""", + (name, controller_job_id, status.value, policy, int(auto_restart), pickle.dumps(requested_resources))) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 4ed6949c82a..e60e13858d5 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -10,7 +10,7 @@ import time import typing from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, Set, - TextIO, Tuple, Type, TypeVar) + TextIO, Type, TypeVar) import colorama import filelock @@ -180,9 +180,9 @@ def _get_existing_controller_names() -> Set[str]: There is two possible indicators for a controller: 1. It is in the cluster database, which means it is already created; - 2. It is in the service database, which means it will be created - later in the future. This usually happens when multiple `sky serve up` - are running simultaneously. + 2. It is not in the cluster database but in the service database, + which means it will be created later in the future. This usually + happens when multiple `sky serve up` are running simultaneously. Returns: A set of existing sky serve controller names. @@ -277,32 +277,25 @@ def _get_service_slot_on_controller(controller_name: str) -> int: Returns: Number of slots on the controller. """ - memory_requirements = 0. - controller_record = global_user_state.get_cluster_from_name(controller_name) - if controller_record is not None: - # If controller is already created, use its launched resources. 
- handle = controller_record['handle'] - assert isinstance(handle, backends.CloudVmRayResourceHandle) - # Determine max number of services on this controller. - controller_cloud = handle.launched_resources.cloud - _, memory_requirements = ( - controller_cloud.get_vcpus_mem_from_instance_type( - handle.launched_resources.instance_type)) - else: - # Corner case: Multiple `sky serve up` are running simultaneously - # and the controller is not created yet. We created a resources - # for each initializing controller, and use the minimal memory - # requirement among them, since any of them could be the first to - # launch the controller. - service_records = (global_user_state.get_services_from_controller_name( - controller_name)) - for service_record in service_records: - r = service_record['handle'].requested_controller_resources - # Remove the '+' in memory requirement. - memory_requirements = min(memory_requirements, - float(r.memory.strip('+'))) + controller_memory = 0. + # Wait for the controller to be created. This could happen if multiple + # `sky serve up` are running simultaneously. + while True: + controller_record = global_user_state.get_cluster_from_name( + controller_name) + if controller_record is not None: + handle = controller_record['handle'] + assert isinstance(handle, backends.CloudVmRayResourceHandle) + # Determine max number of services on this controller. + controller_cloud = handle.launched_resources.cloud + _, controller_memory = ( + controller_cloud.get_vcpus_mem_from_instance_type( + handle.launched_resources.instance_type)) + assert controller_memory is not None + break + time.sleep(5) # Determine max number of services on this controller. - max_services_num = int(memory_requirements / + max_services_num = int(controller_memory / constants.SERVICES_MEMORY_USAGE_GB) # Get current number of services on this controller. 
services_num_on_controller = len( @@ -310,7 +303,7 @@ def _get_service_slot_on_controller(controller_name: str) -> int: return max_services_num - services_num_on_controller -def get_available_controller_name() -> Tuple[str, bool]: +def get_available_controller_name() -> str: """Get available controller name to use. Only consider controllers that have available slots for services. @@ -321,8 +314,7 @@ def get_available_controller_name() -> Tuple[str, bool]: `sky serve up` select the same last slot on a controller. Returns: - A tuple of controller name and a boolean value indicating whether the - controller name is newly generated. + Controller name to use. """ # Get all existing controllers. existing_controllers = _get_existing_controller_names() @@ -330,15 +322,15 @@ def get_available_controller_name() -> Tuple[str, bool]: # Get a mapping from controller name to number of services on it. for controller_name in existing_controllers: num_slots = _get_service_slot_on_controller(controller_name) - # Only consider controllers that have available slots for services. + # Only consider controllers that have available slot for services. if num_slots > 0: controller2slots[controller_name] = num_slots if not controller2slots: - return generate_controller_cluster_name(existing_controllers), True + return generate_controller_cluster_name(existing_controllers) # If multiple controllers are available, choose the one with least number of - # slots, i.e. most number of services. - return min(controller2slots.keys(), - key=lambda k: controller2slots[k]), False + # slots, i.e. most number of services. This helps to decrease the number of + # controllers. + return min(controller2slots.keys(), key=lambda k: controller2slots[k]) def set_service_status_from_replica_statuses( @@ -372,41 +364,6 @@ def update_service_status() -> None: record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED) -class ServiceHandle(object): - """A pickle-able tuple of: - - - (required) Service name. 
- - (required) Service requested controller resources. - - (optional) Service endpoint IP. - - This class is only used as a cache for information fetched from controller. - """ - _VERSION = 0 - - def __init__( - self, - *, - service_name: str, - requested_controller_resources: 'sky.Resources', - endpoint: Optional[str] = None, - ) -> None: - self._version = self._VERSION - self.service_name = service_name - self.requested_controller_resources = requested_controller_resources - self.endpoint = endpoint - - def __repr__(self) -> str: - return ('ServiceHandle(' - f'\n\tservice_name={self.service_name},' - '\n\trequested_controller_resources=' - f'{self.requested_controller_resources},' - f'\n\tendpoint={self.endpoint})') - - def __setstate__(self, state): - self._version = self._VERSION - self.__dict__.update(state) - - def get_replica_info(service_name: str, with_handle: bool) -> List[Dict[str, Any]]: """Get the information of all replicas of the service. @@ -424,11 +381,13 @@ def get_replica_info(service_name: str, ] -def get_latest_info(service_name: str) -> Dict[str, Any]: +def get_latest_info(service_name: str, + with_replica_info: bool = True) -> Dict[str, Any]: """Get the latest information of the service. Args: service_name: The name of the service. + with_replica_info: Whether to include the information of all replicas. Returns: A dictionary of latest information of the service. 
@@ -438,7 +397,9 @@ def get_latest_info(service_name: str) -> Dict[str, Any]: record = serve_state.get_service_from_name(service_name) if record is None: raise ValueError(f'Service {service_name!r} does not exist.') - record['replica_info'] = get_replica_info(service_name, with_handle=True) + if with_replica_info: + record['replica_info'] = get_replica_info(service_name, + with_handle=True) return record @@ -676,7 +637,7 @@ def wait_for_load_balancer_port(service_name: str) -> str: time.sleep(5) for _ in range(constants.SERVICE_PORT_SELECTION_TIMEOUT): try: - latest_info = get_latest_info(service_name) + latest_info = get_latest_info(service_name, with_replica_info=False) except ValueError: # Service is not created yet. time.sleep(1) diff --git a/sky/serve/service.py b/sky/serve/service.py index 2278ee68fc2..a1eb4ee14fb 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -118,11 +118,11 @@ def _start(service_name: str, task_yaml: str, job_id: int): if isinstance(config, dict): resources_config = config.get('resources') requested_resources = resources.Resources.from_yaml_config(resources_config) - serve_state.add_or_update_service(service_name, - job_id, - policy=service_spec.policy_str(), - auto_restart=service_spec.auto_restart, - requested_resources=requested_resources) + serve_state.add_service(service_name, + controller_job_id=job_id, + policy=service_spec.policy_str(), + auto_restart=service_spec.auto_restart, + requested_resources=requested_resources) controller_process = None load_balancer_process = None diff --git a/sky/task.py b/sky/task.py index 98cb13ce855..2ee4d08ac1c 100644 --- a/sky/task.py +++ b/sky/task.py @@ -265,7 +265,7 @@ def __init__( self.spot_dag: Optional['sky.Dag'] = None # Only set when 'self' is a sky serve controller task. - self.service_handle: Optional['serve_lib.ServiceHandle'] = None + self.service_name: Optional[str] = None # Filled in by the optimizer. If None, this Task is not planned. 
self.best_resources = None diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 1ec0a2d9a4b..be940da869d 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -127,7 +127,7 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): StatusColumn('CONTROLLER_NAME', _get_controller_name, show_by_default=False), - StatusColumn('ENDPOINT', _get_display_endpoint), + StatusColumn('ENDPOINT', _get_endpoint), StatusColumn('POLICY', _get_policy, show_by_default=False), StatusColumn('REQUESTED_RESOURCES', _get_requested_resources, @@ -397,10 +397,6 @@ def show_local_status_table(local_clusters: List[str]): 'requested_resources'] -def _get_service_handle(service_record: _ServiceRecord) -> serve.ServiceHandle: - return service_record['handle'] - - def _get_uptime(service_record: _ServiceRecord) -> str: uptime = service_record['uptime'] if uptime is None: @@ -420,13 +416,8 @@ def _get_replicas(service_record: _ServiceRecord) -> str: return f'{ready_replica_num}/{total_replica_num}' -def get_endpoint(service_record: _ServiceRecord) -> Optional[str]: - handle = _get_service_handle(service_record) - return handle.endpoint - - -def _get_display_endpoint(service_record: _ServiceRecord) -> str: - endpoint = get_endpoint(service_record) +def _get_endpoint(service_record: _ServiceRecord) -> str: + endpoint = service_record['endpoint'] if endpoint is None: return '-' return endpoint From cf721a22a458370f2792afd0b67e1b2c89f7afbc Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 11:45:49 -0700 Subject: [PATCH 129/223] comments, docs and function reorder for infra providers --- sky/serve/controller.py | 5 +- sky/serve/infra_providers.py | 339 ++++++++++++++++++----------------- sky/serve/load_balancer.py | 6 +- 3 files changed, 186 insertions(+), 164 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 8ee2ac5c51b..dde8f6eb01c 100644 --- 
a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -97,7 +97,10 @@ def load_balancer_sync(request: fastapi.Request): 'serve_utils.RequestTimestamp for ' 'RequestRateAutoscaler.') self.autoscaler.update_request_information(request_information) - return {'ready_replicas': self.infra_provider.get_ready_replicas()} + return { + 'ready_replica_ips': + self.infra_provider.get_ready_replica_ips() + } threading.Thread(target=self._run_autoscaler).start() diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 0e32deee0f9..036a44d88fc 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -4,15 +4,13 @@ import enum import functools import os -import signal import subprocess import threading import time import traceback import typing -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple -import psutil import requests import sky @@ -61,19 +59,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: logger.error(f' Traceback: {traceback.format_exc()}') -def _interrupt_process_and_children(pid: int) -> None: - parent_process = psutil.Process(pid) - for child_process in parent_process.children(recursive=True): - try: - child_process.send_signal(signal.SIGINT) - except psutil.NoSuchProcess: - pass - try: - parent_process.send_signal(signal.SIGINT) - except psutil.NoSuchProcess: - pass - - def with_lock(func): @functools.wraps(func) @@ -99,21 +84,22 @@ class ProcessStatus(enum.Enum): @dataclasses.dataclass class ReplicaStatusProperty: - """Some properties that determine replica status.""" - # Process status of sky.launch + """Some properties that determine replica status. + + Attributes: + sky_launch_status: Process status of sky.launch. + user_app_failed: Whether the service job failed. + service_ready_now: Latest readiness probe result. + service_once_ready: Whether the service has been ready at least once. 
+ sky_down_status: Process status of sky.down. + """ # Initial value is RUNNING since each `ReplicaInfo` is created # when `sky.launch` is called. sky_launch_status: ProcessStatus = ProcessStatus.RUNNING - # User job status in [FAILED, FAILED_SETUP] user_app_failed: bool = False - # Latest readiness probe result service_ready_now: bool = False - # Whether the service has been ready at least once - # If service was not ready before, we count how long it takes to startup - # and compare it with the initial delay seconds; otherwise, we count how - # many consecutive failures it has. service_once_ready: bool = False - # Process status of sky.down. None means sky.down is not called yet. + # None means sky.down is not called yet. sky_down_status: Optional[ProcessStatus] = None def is_scale_down_succeeded(self) -> bool: @@ -184,7 +170,6 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: return serve_state.ReplicaStatus.STARTING -# TODO(tian): Maybe rename it to Replica class ReplicaInfo: """Replica info for each replica.""" @@ -231,9 +216,11 @@ def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: return info_dict def probe( - self, readiness_suffix: str, post_data: Optional[Union[str, Dict[str, - Any]]] + self, + readiness_suffix: str, + post_data: Optional[Dict[str, Any]], ) -> Tuple['ReplicaInfo', bool]: + """Probe the readiness of the replica.""" replica_ip = self.ip try: msg = '' @@ -277,20 +264,22 @@ def __init__(self, service_name: str, self.service_name: str = service_name self.readiness_suffix: str = spec.readiness_suffix self.initial_delay_seconds: int = spec.initial_delay_seconds - self.post_data: Optional[Union[str, Dict[str, Any]]] = spec.post_data + self.post_data: Optional[Dict[str, Any]] = spec.post_data self.uptime: Optional[float] = None logger.info(f'Readiness probe suffix: {self.readiness_suffix}') logger.info(f'Initial delay seconds: {self.initial_delay_seconds}') - logger.info(f'Post data: {self.post_data} ({type(self.post_data)})') 
+ logger.info(f'Post data: {self.post_data}') - def get_ready_replicas(self) -> Set[str]: - # Returns the endpoints of all ready replicas + def get_ready_replica_ips(self) -> Set[str]: + """Get all ready replica's IP addresses.""" raise NotImplementedError def scale_up(self, n: int) -> None: + """Scale up the service by n replicas.""" raise NotImplementedError def scale_down(self, replica_ids: List[int]) -> None: + """Scale down all replicas in replica_ids.""" raise NotImplementedError @@ -310,11 +299,120 @@ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', threading.Thread(target=self._job_status_fetcher).start() threading.Thread(target=self._replica_prober).start() - # This process periodically checks all sky.launch and sky.down process - # on the fly. If any of them finished, it will update the status of - # the corresponding replica. + ################################ + # Replica management functions # + ################################ + + def get_ready_replica_ips(self) -> Set[str]: + ready_replicas = set() + infos = serve_state.get_replica_infos(self.service_name) + for info in infos: + if info.status == serve_state.ReplicaStatus.READY: + assert info.ip is not None + ready_replicas.add(info.ip) + return ready_replicas + + def _launch_replica(self, replica_id: int) -> None: + cluster_name = serve_utils.generate_replica_cluster_name( + self.service_name, replica_id) + if replica_id in self.launch_process_pool: + logger.warning(f'Launch process for replica {replica_id} ' + 'already exists. Skipping.') + return + logger.info(f'Creating replica {replica_id}') + # TODO(tian): We should do usage_lib.messages.usage.set_internal() + # after we change to python API. 
+ cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] + cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) + log_file_name = serve_utils.generate_replica_launch_log_file_name( + self.service_name, replica_id) + with open(log_file_name, 'w') as f: + # pylint: disable=consider-using-with + p = subprocess.Popen(cmd, + stdin=subprocess.DEVNULL, + stdout=f, + stderr=f) + self.launch_process_pool[replica_id] = p + info = ReplicaInfo(replica_id, cluster_name) + serve_state.add_or_update_replica(self.service_name, replica_id, info) + + def scale_up(self, n: int) -> None: + for _ in range(n): + self._launch_replica(self.next_replica_id) + self.next_replica_id += 1 + + def _teardown_replica(self, replica_id: int, sync_down_logs: bool) -> None: + if replica_id in self.down_process_pool: + logger.warning(f'Down process for replica {replica_id} already ' + 'exists. Skipping.') + return + + def _sync_down_logs(): + info = serve_state.get_replica_info_from_id(self.service_name, + replica_id) + if info is None: + logger.error(f'Cannot find replica {replica_id} in the ' + 'replica table. Skipping syncing down logs.') + return + logger.info(f'Syncing down logs for replica {replica_id}...') + backend = backends.CloudVmRayBackend() + handle = global_user_state.get_handle_from_cluster_name( + info.cluster_name) + if handle is None: + logger.error(f'Cannot find cluster {info.cluster_name} ' + 'in the cluster table. 
Skipping syncing ' + 'down logs.') + return + replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, + 'replica_jobs') + log_file = backend_utils.download_and_stream_latest_job_log( + backend, + handle, + replica_job_logs_dir, + log_position_hint='replica cluster', + log_finish_hint=f'Replica: {replica_id}') + if log_file is not None: + local_log_file_name = ( + serve_utils.generate_replica_local_log_file_name( + self.service_name, replica_id)) + os.rename(log_file, local_log_file_name) + + if sync_down_logs: + _sync_down_logs() + + logger.info(f'Deleting replica {replica_id}') + info = serve_state.get_replica_info_from_id(self.service_name, + replica_id) + assert info is not None + cmd = ['sky', 'down', info.cluster_name, '-y'] + log_file_name = serve_utils.generate_replica_down_log_file_name( + self.service_name, replica_id) + with open(log_file_name, 'w') as f: + # pylint: disable=consider-using-with + p = subprocess.Popen(cmd, + stdin=subprocess.DEVNULL, + stdout=f, + stderr=f) + self.down_process_pool[replica_id] = p + info.status_property.sky_down_status = ProcessStatus.RUNNING + serve_state.add_or_update_replica(self.service_name, replica_id, info) + + def scale_down(self, replica_ids: List[int]) -> None: + for replica_id in replica_ids: + self._teardown_replica(replica_id, sync_down_logs=False) + + ################################ + # InfraProvider Daemon Threads # + ################################ + @with_lock def _refresh_process_pool(self) -> None: + """Refresh the launch/down process pool. + + This function will checks all sky.launch and sky.down process on + the fly. If any of them finished, it will update the status of the + corresponding replica. 
+ """ for replica_id, p in list(self.launch_process_pool.items()): if p.poll() is not None: # TODO(tian): Try-catch in process, and have an enum return @@ -375,6 +473,7 @@ def _refresh_process_pool(self) -> None: replica_id, info) def _process_pool_refresher(self) -> None: + """Periodically refresh the launch/down process pool.""" while True: logger.info('Refreshing process pool.') try: @@ -387,6 +486,12 @@ def _process_pool_refresher(self) -> None: @with_lock def _fetch_job_status(self) -> None: + """Fetch the service job status of all replicas. + + This function will monitor the job status of all replicas + to make sure the service is running correctly. If any of the + replicas failed, it will terminate the replica. + """ infos = serve_state.get_replica_infos(self.service_name) for info in infos: if not info.status_property.should_track_status(): @@ -408,11 +513,12 @@ def _fetch_job_status(self) -> None: serve_state.add_or_update_replica(self.service_name, info.replica_id, info) logger.warning( - f'User APP for replica {info.replica_id} FAILED. ' + f'Service job for replica {info.replica_id} FAILED. 
' 'Terminating...') self._teardown_replica(info.replica_id, sync_down_logs=True) def _job_status_fetcher(self) -> None: + """Periodically fetch the service job status of all replicas.""" while True: logger.info('Refreshing job status.') try: @@ -423,125 +529,16 @@ def _job_status_fetcher(self) -> None: logger.error(f'Error in job status fetcher: {e}') time.sleep(_JOB_STATUS_FETCH_INTERVAL) - def get_ready_replicas(self) -> Set[str]: - ready_replicas = set() - infos = serve_state.get_replica_infos(self.service_name) - for info in infos: - if info.status == serve_state.ReplicaStatus.READY: - assert info.ip is not None - ready_replicas.add(info.ip) - return ready_replicas - - def _launch_replica(self, replica_id: int) -> None: - cluster_name = serve_utils.generate_replica_cluster_name( - self.service_name, replica_id) - if replica_id in self.launch_process_pool: - logger.warning(f'Launch process for replica {replica_id} ' - 'already exists. Skipping.') - return - logger.info(f'Creating replica {replica_id}') - # TODO(tian): We should do usage_lib.messages.usage.set_internal() - # after we change to python API. 
- cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] - cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) - log_file_name = serve_utils.generate_replica_launch_log_file_name( - self.service_name, replica_id) - with open(log_file_name, 'w') as f: - # pylint: disable=consider-using-with - p = subprocess.Popen(cmd, - stdin=subprocess.DEVNULL, - stdout=f, - stderr=f) - self.launch_process_pool[replica_id] = p - info = ReplicaInfo(replica_id, cluster_name) - serve_state.add_or_update_replica(self.service_name, replica_id, info) - - def scale_up(self, n: int) -> None: - # Launch n new replicas - for _ in range(n): - self._launch_replica(self.next_replica_id) - self.next_replica_id += 1 - - def _teardown_replica(self, replica_id: int, sync_down_logs: bool) -> None: - if replica_id in self.down_process_pool: - logger.warning(f'Down process for replica {replica_id} already ' - 'exists. Skipping.') - return - - def _sync_down_logs(): - info = serve_state.get_replica_info_from_id(self.service_name, - replica_id) - if info is None: - logger.error(f'Cannot find replica {replica_id} in the ' - 'replica table. Skipping syncing down logs.') - return - logger.info(f'Syncing down logs for replica {replica_id}...') - backend = backends.CloudVmRayBackend() - handle = global_user_state.get_handle_from_cluster_name( - info.cluster_name) - if handle is None: - logger.error(f'Cannot find cluster {info.cluster_name} ' - 'in the cluster table. 
Skipping syncing ' - 'down logs.') - return - replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, - 'replica_jobs') - log_file = backend_utils.download_and_stream_latest_job_log( - backend, - handle, - replica_job_logs_dir, - log_position_hint='replica cluster', - log_finish_hint=f'Replica: {replica_id}') - if log_file is not None: - local_log_file_name = ( - serve_utils.generate_replica_local_log_file_name( - self.service_name, replica_id)) - os.rename(log_file, local_log_file_name) - - if sync_down_logs: - _sync_down_logs() - - logger.info(f'Deleting replica {replica_id}') - info = serve_state.get_replica_info_from_id(self.service_name, - replica_id) - assert info is not None - cmd = ['sky', 'down', info.cluster_name, '-y'] - log_file_name = serve_utils.generate_replica_down_log_file_name( - self.service_name, replica_id) - with open(log_file_name, 'w') as f: - # pylint: disable=consider-using-with - p = subprocess.Popen(cmd, - stdin=subprocess.DEVNULL, - stdout=f, - stderr=f) - self.down_process_pool[replica_id] = p - info.status_property.sky_down_status = ProcessStatus.RUNNING - serve_state.add_or_update_replica(self.service_name, replica_id, info) - - def scale_down(self, replica_ids: List[int]) -> None: - for replica_id in replica_ids: - self._teardown_replica(replica_id, sync_down_logs=False) - - def _replica_prober(self) -> None: - while True: - logger.info('Running replica prober.') - try: - self._probe_all_replicas() - replica_statuses = [ - info['status'] - for info in serve_utils.get_replica_info(self.service_name, - with_handle=False) - ] - serve_utils.set_service_status_from_replica_statuses( - self.service_name, replica_statuses) - except Exception as e: # pylint: disable=broad-except - # No matter what error happens, we should keep the - # replica prober running. 
- logger.error(f'Error in replica prober: {e}') - time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL) - @with_lock def _probe_all_replicas(self) -> None: + """Readiness probe replicas. + + This function will probe all replicas to make sure the service is + ready. It will keep track of: + (1) the initial delay for each replica; + (2) the consecutive failure times. + The replica will be terminated if any of the thresholds exceeded. + """ probe_futures = [] replica_to_probe = [] with futures.ThreadPoolExecutor() as executor: @@ -555,6 +552,10 @@ def _probe_all_replicas(self) -> None: self.post_data)) logger.info(f'Replicas to probe: {replica_to_probe}') + # Since futures.as_completed will return futures in the order of + # completion, we need the info.probe function to return the info + # object as well, so that we could update the info object in the + # same order. for future in futures.as_completed(probe_futures): future_result: Tuple[ReplicaInfo, bool] = future.result() info, probe_succeeded = future_result @@ -563,9 +564,8 @@ def _probe_all_replicas(self) -> None: if probe_succeeded: if self.uptime is None: self.uptime = time.time() - logger.info(f'Replica {info.replica_id} is the first ' - 'ready replica. Setting uptime to ' - f'{self.uptime}.') + logger.info(f'Replica {info.replica_id} is the first ready ' + f'replica. Setting uptime to {self.uptime}.') serve_state.set_service_uptime(self.service_name, int(self.uptime)) info.consecutive_failure_times.clear() @@ -583,14 +583,14 @@ def _probe_all_replicas(self) -> None: if (consecutive_failure_time >= _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): logger.info( - f'Replica {info.replica_id} is not ready for too ' - 'long and exceeding consecutive failure ' + f'Replica {info.replica_id} is not ready for ' + 'too long and exceeding consecutive failure ' 'threshold. 
Terminating the replica...') should_teardown = True else: logger.info( - f'Replica {info.replica_id} is not ready but ' - 'within consecutive failure threshold ' + f'Replica {info.replica_id} is not ready ' + 'but within consecutive failure threshold ' f'({consecutive_failure_time}s / ' f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' 'Skipping.') @@ -613,3 +613,22 @@ def _probe_all_replicas(self) -> None: info.replica_id, info) if should_teardown: self._teardown_replica(info.replica_id, sync_down_logs=True) + + def _replica_prober(self) -> None: + """Periodically probe replicas.""" + while True: + logger.info('Running replica prober.') + try: + self._probe_all_replicas() + replica_statuses = [ + info['status'] + for info in serve_utils.get_replica_info(self.service_name, + with_handle=False) + ] + serve_utils.set_service_status_from_replica_statuses( + self.service_name, replica_statuses) + except Exception as e: # pylint: disable=broad-except + # No matter what error happens, we should keep the + # replica prober running. + logger.error(f'Error in replica prober: {e}') + time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index b82debb18c0..5b7a0cde8cc 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -63,13 +63,13 @@ def _sync_with_controller(self): # Clean up after reporting request information to avoid OOM. 
self.request_information.clear() response.raise_for_status() - ready_replicas = response.json()['ready_replicas'] + ready_replica_ips = response.json()['ready_replica_ips'] except requests.RequestException as e: print(f'An error occurred: {e}') else: - logger.info(f'Available Replica IPs: {ready_replicas}') + logger.info(f'Available Replica IPs: {ready_replica_ips}') self.load_balancing_policy.set_ready_replicas( - ready_replicas) + ready_replica_ips) time.sleep(constants.CONTROLLER_SYNC_INTERVAL) async def _redirect_handler(self, request: fastapi.Request): From dfcc373b76456c02cd89798c55792892245d9410 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 12:32:55 -0700 Subject: [PATCH 130/223] change launch/terminate replica to python API --- sky/serve/infra_providers.py | 148 +++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 42 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index 036a44d88fc..b6921684491 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -3,8 +3,8 @@ import dataclasses import enum import functools +import multiprocessing import os -import subprocess import threading import time import traceback @@ -15,6 +15,7 @@ import sky from sky import backends +from sky import exceptions from sky import global_user_state from sky import sky_logging from sky.backends import backend_utils @@ -35,28 +36,92 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20 # TODO(tian): Maybe let user determine this threshold _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 +_RETRY_INIT_GAP_SECONDS = 60 + + +def launch_cluster(task: sky.Task, + cluster_name: str, + max_retry: int = 3) -> None: + """Launch a sky serve replica cluster. + + This function will not wait for the job starts running. It will return + immediately after the job is submitted. 
+ + Raises: + RuntimeError: If failed to launch the cluster after max_retry retries, + or some error happened before provisioning and will happen again + if retry. + """ + retry_cnt = 0 + backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS) + while True: + retry_cnt += 1 + try: + usage_lib.messages.usage.set_internal() + sky.launch(task, + cluster_name, + detach_setup=True, + detach_run=True, + retry_until_up=True) + logger.info(f'Replica cluster {cluster_name} launched.') + except (exceptions.InvalidClusterNameError, + exceptions.NoCloudAccessError, + exceptions.ResourcesMismatchError) as e: + logger.error('Failure happened before provisioning. ' + f'{common_utils.format_exception(e)}') + raise RuntimeError('Failed to launch the sky serve replica ' + f'cluster {cluster_name}.') from e + except exceptions.ResourcesUnavailableError as e: + if not any( + isinstance(err, exceptions.ResourcesUnavailableError) + for err in e.failover_history): + raise RuntimeError('Failed to launch the sky serve replica ' + f'cluster {cluster_name}.') from e + logger.info('Failed to launch the sky serve replica cluster with ' + f'error: {common_utils.format_exception(e)})') + except Exception as e: # pylint: disable=broad-except + logger.info('Failed to launch the sky serve replica cluster with ' + f'error: {common_utils.format_exception(e)})') + logger.info(f' Traceback: {traceback.format_exc()}') + else: # No exception, the launch succeeds. 
+ return + + terminate_cluster(cluster_name) + if retry_cnt >= max_retry: + raise RuntimeError('Failed to launch the sky serve replica cluster ' + f'{cluster_name} after {max_retry} retries.') + gap_seconds = backoff.current_backoff() + logger.info('Retrying to launch the sky serve replica cluster ' + f'in {gap_seconds:.1f} seconds.') + time.sleep(gap_seconds) def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: """Terminate the sky serve replica cluster.""" retry_cnt = 0 + backoff = common_utils.Backoff() while True: + retry_cnt += 1 try: usage_lib.messages.usage.set_internal() sky.down(cluster_name) return except ValueError: - # The cluster is already down. + # The cluster is already terminated. + logger.info( + f'Replica cluster {cluster_name} is already terminated.') return except Exception as e: # pylint: disable=broad-except - retry_cnt += 1 if retry_cnt >= max_retry: raise RuntimeError('Failed to terminate the sky serve replica ' f'cluster {cluster_name}.') from e - logger.error('Failed to terminate the sky serve replica ' - f'cluster {cluster_name}. Retrying.' - f'Details: {common_utils.format_exception(e)}') + gap_seconds = backoff.current_backoff() + logger.error( + 'Failed to terminate the sky serve replica cluster ' + f'{cluster_name}. Retrying after {gap_seconds} seconds.' 
+ f'Details: {common_utils.format_exception(e)}') logger.error(f' Traceback: {traceback.format_exc()}') + time.sleep(gap_seconds) def with_lock(func): @@ -289,11 +354,11 @@ class SkyPilotInfraProvider(InfraProvider): def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', task_yaml_path: str) -> None: super().__init__(service_name, spec) - self.task_yaml_path: str = task_yaml_path + self.task: sky.Task = sky.Task.from_yaml(task_yaml_path) self.launch_process_pool: serve_utils.ThreadSafeDict[ - int, subprocess.Popen] = serve_utils.ThreadSafeDict() + int, multiprocessing.Process] = serve_utils.ThreadSafeDict() self.down_process_pool: serve_utils.ThreadSafeDict[ - int, subprocess.Popen] = serve_utils.ThreadSafeDict() + int, multiprocessing.Process] = serve_utils.ThreadSafeDict() threading.Thread(target=self._process_pool_refresher).start() threading.Thread(target=self._job_status_fetcher).start() @@ -313,25 +378,23 @@ def get_ready_replica_ips(self) -> Set[str]: return ready_replicas def _launch_replica(self, replica_id: int) -> None: - cluster_name = serve_utils.generate_replica_cluster_name( - self.service_name, replica_id) if replica_id in self.launch_process_pool: logger.warning(f'Launch process for replica {replica_id} ' 'already exists. Skipping.') return - logger.info(f'Creating replica {replica_id}') - # TODO(tian): We should do usage_lib.messages.usage.set_internal() - # after we change to python API. 
- cmd = ['sky', 'launch', self.task_yaml_path, '-c', cluster_name, '-y'] - cmd.extend(['--detach-setup', '--detach-run', '--retry-until-up']) + logger.info(f'Launching replica {replica_id}') + cluster_name = serve_utils.generate_replica_cluster_name( + self.service_name, replica_id) log_file_name = serve_utils.generate_replica_launch_log_file_name( self.service_name, replica_id) - with open(log_file_name, 'w') as f: - # pylint: disable=consider-using-with - p = subprocess.Popen(cmd, - stdin=subprocess.DEVNULL, - stdout=f, - stderr=f) + p = multiprocessing.Process( + target=serve_utils.RedirectOutputTo( + launch_cluster, + log_file_name, + ).run, + args=(self.task, cluster_name), + ) + p.start() self.launch_process_pool[replica_id] = p info = ReplicaInfo(replica_id, cluster_name) serve_state.add_or_update_replica(self.service_name, replica_id, info) @@ -341,10 +404,10 @@ def scale_up(self, n: int) -> None: self._launch_replica(self.next_replica_id) self.next_replica_id += 1 - def _teardown_replica(self, replica_id: int, sync_down_logs: bool) -> None: + def _terminate_replica(self, replica_id: int, sync_down_logs: bool) -> None: if replica_id in self.down_process_pool: - logger.warning(f'Down process for replica {replica_id} already ' - 'exists. Skipping.') + logger.warning(f'Terminate process for replica {replica_id} ' + 'already exists. 
Skipping.') return def _sync_down_logs(): @@ -384,22 +447,23 @@ def _sync_down_logs(): info = serve_state.get_replica_info_from_id(self.service_name, replica_id) assert info is not None - cmd = ['sky', 'down', info.cluster_name, '-y'] log_file_name = serve_utils.generate_replica_down_log_file_name( self.service_name, replica_id) - with open(log_file_name, 'w') as f: - # pylint: disable=consider-using-with - p = subprocess.Popen(cmd, - stdin=subprocess.DEVNULL, - stdout=f, - stderr=f) + p = multiprocessing.Process( + target=serve_utils.RedirectOutputTo( + terminate_cluster, + log_file_name, + ).run, + args=(info.cluster_name,), + ) + p.start() self.down_process_pool[replica_id] = p info.status_property.sky_down_status = ProcessStatus.RUNNING serve_state.add_or_update_replica(self.service_name, replica_id, info) def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: - self._teardown_replica(replica_id, sync_down_logs=False) + self._terminate_replica(replica_id, sync_down_logs=False) ################################ # InfraProvider Daemon Threads # @@ -414,7 +478,7 @@ def _refresh_process_pool(self) -> None: corresponding replica. """ for replica_id, p in list(self.launch_process_pool.items()): - if p.poll() is not None: + if not p.is_alive(): # TODO(tian): Try-catch in process, and have an enum return # value to indicate which type of failure happened. # Currently we only have user code failure since the @@ -426,29 +490,29 @@ def _refresh_process_pool(self) -> None: info = serve_state.get_replica_info_from_id( self.service_name, replica_id) assert info is not None - if p.returncode != 0: + if p.exitcode != 0: logger.warning( f'Launch process for replica {replica_id} exited ' - f'abnormally with code {p.returncode}. Terminating...') + f'abnormally with code {p.exitcode}. 
Terminating...') info.status_property.sky_launch_status = ( ProcessStatus.FAILED) - self._teardown_replica(replica_id, sync_down_logs=True) + self._terminate_replica(replica_id, sync_down_logs=True) else: info.status_property.sky_launch_status = ( ProcessStatus.SUCCEEDED) serve_state.add_or_update_replica(self.service_name, replica_id, info) for replica_id, p in list(self.down_process_pool.items()): - if p.poll() is not None: + if not p.is_alive(): logger.info(f'Down process for replica {replica_id} finished.') del self.down_process_pool[replica_id] info = serve_state.get_replica_info_from_id( self.service_name, replica_id) assert info is not None - if p.returncode != 0: + if p.exitcode != 0: logger.error( f'Down process for replica {replica_id} exited ' - f'abnormally with code {p.returncode}.') + f'abnormally with code {p.exitcode}.') info.status_property.sky_down_status = ( ProcessStatus.FAILED) else: @@ -515,7 +579,7 @@ def _fetch_job_status(self) -> None: logger.warning( f'Service job for replica {info.replica_id} FAILED. 
' 'Terminating...') - self._teardown_replica(info.replica_id, sync_down_logs=True) + self._terminate_replica(info.replica_id, sync_down_logs=True) def _job_status_fetcher(self) -> None: """Periodically fetch the service job status of all replicas.""" @@ -612,7 +676,7 @@ def _probe_all_replicas(self) -> None: serve_state.add_or_update_replica(self.service_name, info.replica_id, info) if should_teardown: - self._teardown_replica(info.replica_id, sync_down_logs=True) + self._terminate_replica(info.replica_id, sync_down_logs=True) def _replica_prober(self) -> None: """Periodically probe replicas.""" From 1fa207f6d4ea73f822b885c4f05480c1158d55f4 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 13:25:01 -0700 Subject: [PATCH 131/223] fix pass task in multiprocessing --- sky/serve/infra_providers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/serve/infra_providers.py b/sky/serve/infra_providers.py index b6921684491..be09817ebf2 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/infra_providers.py @@ -39,7 +39,7 @@ _RETRY_INIT_GAP_SECONDS = 60 -def launch_cluster(task: sky.Task, +def launch_cluster(task_yaml_path: str, cluster_name: str, max_retry: int = 3) -> None: """Launch a sky serve replica cluster. @@ -52,6 +52,7 @@ def launch_cluster(task: sky.Task, or some error happened before provisioning and will happen again if retry. 
""" + task = sky.Task.from_yaml(task_yaml_path) retry_cnt = 0 backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS) while True: @@ -354,7 +355,7 @@ class SkyPilotInfraProvider(InfraProvider): def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', task_yaml_path: str) -> None: super().__init__(service_name, spec) - self.task: sky.Task = sky.Task.from_yaml(task_yaml_path) + self.task_yaml_path = task_yaml_path self.launch_process_pool: serve_utils.ThreadSafeDict[ int, multiprocessing.Process] = serve_utils.ThreadSafeDict() self.down_process_pool: serve_utils.ThreadSafeDict[ @@ -392,7 +393,7 @@ def _launch_replica(self, replica_id: int) -> None: launch_cluster, log_file_name, ).run, - args=(self.task, cluster_name), + args=(self.task_yaml_path, cluster_name), ) p.start() self.launch_process_pool[replica_id] = p From e2365e7abb29d0ec46575cf0ac301525b5f3fc19 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 14:52:51 -0700 Subject: [PATCH 132/223] minor & fix smoke test --- sky/backends/backend_utils.py | 1 + tests/skyserve/http/azure.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 7b0da2b9ffb..80e06752b53 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2751,6 +2751,7 @@ def _refresh_service_record_no_lock( if not record['endpoint']: # Service controller is still initializing. Skipped refresh status. 
+ record['status'] = serve_lib.ServiceStatus.CONTROLLER_INIT return record, None controller_name = record['controller_name'] diff --git a/tests/skyserve/http/azure.yaml b/tests/skyserve/http/azure.yaml index 3c4c474a184..2b67158fcd1 100644 --- a/tests/skyserve/http/azure.yaml +++ b/tests/skyserve/http/azure.yaml @@ -11,5 +11,5 @@ service: port: 8081 readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 200 replicas: 2 From 1e7936d3c14c95a3487a012d552c93c7f2c97934 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 15:42:10 -0700 Subject: [PATCH 133/223] rename infra_provider to replica_manager --- sky/serve/README.md | 2 +- sky/serve/controller.py | 16 ++++++++-------- .../{infra_providers.py => replica_managers.py} | 16 ++++++++-------- sky/serve/serve_state.py | 9 +++++---- sky/serve/service.py | 12 ++++++------ 5 files changed, 28 insertions(+), 27 deletions(-) rename sky/serve/{infra_providers.py => replica_managers.py} (98%) diff --git a/sky/serve/README.md b/sky/serve/README.md index f8f33df6244..34c1406cd1e 100644 --- a/sky/serve/README.md +++ b/sky/serve/README.md @@ -12,7 +12,7 @@ Sky Serve has four key components: 1. Redirector - The HTTP server is responsible for receiving requests and redirecting them to healthy endpoints. 2. Load balancers - spread requests across healthy endpoints according to different policies. 3. Autoscalers - scale up and down the number of serving endpoints according to different policies and handle recovery of unhealthy endpoints. -4. Infra Providers - provides a uniform interface to talk to SkyPilot. +4. Replica Managers - provides a uniform interface to talk to SkyPilot. 
## Usage diff --git a/sky/serve/controller.py b/sky/serve/controller.py index dde8f6eb01c..8b12c841e43 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -16,7 +16,7 @@ from sky import sky_logging from sky.serve import autoscalers from sky.serve import constants -from sky.serve import infra_providers +from sky.serve import replica_managers from sky.serve import serve_utils from sky.utils import env_options @@ -41,10 +41,10 @@ class SkyServeController: def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, port: int) -> None: self.service_name = service_name - self.infra_provider: infra_providers.InfraProvider = ( - infra_providers.SkyPilotInfraProvider(service_name, - service_spec, - task_yaml_path=task_yaml)) + self.replica_manager: replica_managers.ReplicaManager = ( + replica_managers.SkyPilotReplicaManager(service_name, + service_spec, + task_yaml_path=task_yaml)) self.autoscaler: autoscalers.Autoscaler = ( autoscalers.RequestRateAutoscaler( service_spec, @@ -67,12 +67,12 @@ def _run_autoscaler(self): autoscalers.AutoscalerDecisionOperator.SCALE_UP): assert isinstance(scaling_option.target, int), scaling_option - self.infra_provider.scale_up(scaling_option.target) + self.replica_manager.scale_up(scaling_option.target) elif (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_DOWN): assert isinstance(scaling_option.target, list), scaling_option - self.infra_provider.scale_down(scaling_option.target) + self.replica_manager.scale_down(scaling_option.target) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. 
@@ -99,7 +99,7 @@ def load_balancer_sync(request: fastapi.Request): self.autoscaler.update_request_information(request_information) return { 'ready_replica_ips': - self.infra_provider.get_ready_replica_ips() + self.replica_manager.get_ready_replica_ips() } threading.Thread(target=self._run_autoscaler).start() diff --git a/sky/serve/infra_providers.py b/sky/serve/replica_managers.py similarity index 98% rename from sky/serve/infra_providers.py rename to sky/serve/replica_managers.py index be09817ebf2..cfa66c0c331 100644 --- a/sky/serve/infra_providers.py +++ b/sky/serve/replica_managers.py @@ -1,4 +1,4 @@ -"""InfraProvider: handles the creation and deletion of endpoint replicas.""" +"""ReplicaManager: handles the creation and deletion of endpoint replicas.""" from concurrent import futures import dataclasses import enum @@ -320,8 +320,8 @@ def probe( return self, False -class InfraProvider: - """Each infra provider manages one service.""" +class ReplicaManager: + """Each replica manager monitors one service.""" def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec') -> None: @@ -349,8 +349,8 @@ def scale_down(self, replica_ids: List[int]) -> None: raise NotImplementedError -class SkyPilotInfraProvider(InfraProvider): - """Infra provider for SkyPilot clusters.""" +class SkyPilotReplicaManager(ReplicaManager): + """Replica Manager for SkyPilot clusters.""" def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', task_yaml_path: str) -> None: @@ -466,9 +466,9 @@ def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: self._terminate_replica(replica_id, sync_down_logs=False) - ################################ - # InfraProvider Daemon Threads # - ################################ + ################################# + # ReplicaManager Daemon Threads # + ################################# @with_lock def _refresh_process_pool(self) -> None: diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 
607add9f9fc..ce445e6f576 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -14,7 +14,7 @@ if typing.TYPE_CHECKING: import sky - from sky.serve import infra_providers + from sky.serve import replica_managers _DB_PATH = pathlib.Path(constants.SERVE_PREFIX) / 'services.db' _DB_PATH = _DB_PATH.expanduser().absolute() @@ -266,7 +266,7 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]: # === Replica functions === def add_or_update_replica(service_name: str, replica_id: int, - replica_info: 'infra_providers.ReplicaInfo') -> None: + replica_info: 'replica_managers.ReplicaInfo') -> None: """Adds a replica to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( @@ -289,7 +289,7 @@ def remove_replica(service_name: str, replica_id: int) -> None: def get_replica_info_from_id( service_name: str, - replica_id: int) -> Optional['infra_providers.ReplicaInfo']: + replica_id: int) -> Optional['replica_managers.ReplicaInfo']: """Gets a replica info from the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: rows = cursor.execute( @@ -302,7 +302,8 @@ def get_replica_info_from_id( return None -def get_replica_infos(service_name: str) -> List['infra_providers.ReplicaInfo']: +def get_replica_infos( + service_name: str) -> List['replica_managers.ReplicaInfo']: """Gets all replica infos of a service.""" with db_utils.safe_cursor(_DB_PATH) as cursor: rows = cursor.execute( diff --git a/sky/serve/service.py b/sky/serve/service.py index a1eb4ee14fb..d2a1f5cc52c 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -19,8 +19,8 @@ from sky.backends import cloud_vm_ray_backend from sky.serve import constants from sky.serve import controller -from sky.serve import infra_providers from sky.serve import load_balancer +from sky.serve import replica_managers from sky.serve import serve_state from sky.serve import serve_utils from sky.utils import common_utils @@ -62,18 +62,18 @@ def _cleanup(service_name: 
str, task_yaml: str) -> bool: """Clean up the sky serve replicas, storage, and service record.""" failed = False replica_infos = serve_state.get_replica_infos(service_name) - info2proc: Dict[infra_providers.ReplicaInfo, + info2proc: Dict[replica_managers.ReplicaInfo, multiprocessing.Process] = dict() for info in replica_infos: - p = multiprocessing.Process(target=infra_providers.terminate_cluster, + p = multiprocessing.Process(target=replica_managers.terminate_cluster, args=(info.cluster_name,)) p.start() info2proc[info] = p # Set replica status to `SHUTTING_DOWN` info.status_property.sky_launch_status = ( - infra_providers.ProcessStatus.SUCCEEDED) + replica_managers.ProcessStatus.SUCCEEDED) info.status_property.sky_down_status = ( - infra_providers.ProcessStatus.RUNNING) + replica_managers.ProcessStatus.RUNNING) serve_state.add_or_update_replica(service_name, info.replica_id, info) logger.info(f'Terminating replica {info.replica_id} ...') for info, p in info2proc.items(): @@ -84,7 +84,7 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: else: # Set replica status to `FAILED_CLEANUP` info.status_property.sky_down_status = ( - infra_providers.ProcessStatus.FAILED) + replica_managers.ProcessStatus.FAILED) serve_state.add_or_update_replica(service_name, info.replica_id, info) failed = True From 1aba2388ce5609979b0478849df89658f5475708 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 16:40:12 -0700 Subject: [PATCH 134/223] ux --- sky/backends/cloud_vm_ray_backend.py | 22 ++++++----------- sky/cli.py | 3 +-- sky/execution.py | 37 +++++++++++++++------------- sky/serve/__init__.py | 2 +- sky/serve/autoscalers.py | 2 +- sky/serve/constants.py | 2 +- sky/serve/load_balancer.py | 4 +-- sky/serve/replica_managers.py | 3 ++- sky/serve/service_spec.py | 2 +- tests/test_smoke.py | 10 ++++---- 10 files changed, 42 insertions(+), 45 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 
4b42d137429..e44e2a48a32 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2674,7 +2674,6 @@ def __init__(self): self._dag = None self._optimize_target = None self._requested_features = set() - self._minimize_logging = False # Command for running the setup script. It is only set when the # setup needs to be run outside the self._setup() and as part of @@ -2690,8 +2689,6 @@ def register_info(self, **kwargs) -> None: self._optimize_target) or optimizer.OptimizeTarget.COST self._requested_features = kwargs.pop('requested_features', self._requested_features) - self._minimize_logging = kwargs.pop('minimize_logging', - self._minimize_logging) assert len(kwargs) == 0, f'Unexpected kwargs: {kwargs}' def check_resources_fit_cluster( @@ -3362,9 +3359,8 @@ def _exec_code_on_head( f'Failed to submit job {job_id}.', stderr=stdout + stderr) - if not self._minimize_logging: - logger.info('Job submitted with Job ID: ' - f'{style.BRIGHT}{job_id}{style.RESET_ALL}') + logger.info('Job submitted with Job ID: ' + f'{style.BRIGHT}{job_id}{style.RESET_ALL}') try: if not detach_run: @@ -3376,7 +3372,8 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - if name == spot_lib.SPOT_CONTROLLER_NAME: + group = backend_utils.ReservedClusterGroup.get_group(name) + if group == backend_utils.ReservedClusterGroup.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' @@ -3395,7 +3392,7 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif not self._minimize_logging: + elif group is None: # Disable this logging for sky serve controller logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -3512,7 +3509,8 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = 
handle.cluster_name - if down or self._minimize_logging: + group = backend_utils.ReservedClusterGroup.get_group(name) + if group is not None or down: return stop_str = ('\nTo stop the cluster:' f'\t{backend_utils.BOLD}sky stop {name}' @@ -3662,10 +3660,6 @@ def cancel_jobs(self, f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout) cancelled_ids = common_utils.decode_payload(stdout) - - if self._minimize_logging: - return - if cancelled_ids: logger.info( f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}') @@ -4416,7 +4410,7 @@ def _check_existing_cluster( f'{cluster_name!r} [Username: {ssh_user}].' f'{colorama.Style.RESET_ALL}\n' 'Run `sky status` to see existing clusters.') - elif not self._minimize_logging: + else: logger.info( f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} ' f'[{task.num_nodes}x {to_provision}].' diff --git a/sky/cli.py b/sky/cli.py index 47a4a6cc616..fdf3c49d4d6 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4126,7 +4126,7 @@ def serve_up( 'Specifying ports in resources is not allowed. SkyServe will ' 'use the port specified in the service section.') - click.secho('\nService Spec:', fg='cyan') + click.secho('Service Spec:', fg='cyan') click.echo(task.service) click.secho('Each replica will use the following resources (estimated):', @@ -4134,7 +4134,6 @@ def serve_up( with sky.Dag() as dag: dag.add(task) sky.optimize(dag) - click.echo() if not yes: prompt = f'Launching a new service {service_name!r}. Proceed?' diff --git a/sky/execution.py b/sky/execution.py index fdd37891758..62f84280b7d 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -173,10 +173,10 @@ def _execute( idle_minutes_to_autostop: Optional[int] = None, no_setup: bool = False, clone_disk_from: Optional[str] = None, - minimize_logging: bool = False, # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, + _is_launched_by_sky_serve_controller: bool = False, ) -> None: """Execute an entrypoint. 
@@ -313,8 +313,7 @@ def _execute( backend.register_info(dag=dag, optimize_target=optimize_target, - requested_features=requested_features, - minimize_logging=minimize_logging) + requested_features=requested_features) if task.storage_mounts is not None: # Optimizer should eventually choose where to store bucket @@ -367,14 +366,17 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - if not minimize_logging: + group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + if group is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # # Don't print if this job is launched by the spot controller, # because spot jobs are serverless, there can be many of them, and # users tend to continuously monitor spot jobs using `sky spot # status`. Also don't print if this job is a skyserve controller - # job. + # job or launched by a skyserve controller job, because the + # redirect for this subprocess.run won't success and it will + # pollute the controller logs. # # Disable the usage collection for this status command. env = dict(os.environ, @@ -403,6 +405,7 @@ def launch( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, + _is_launched_by_sky_serve_controller: bool = False, ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a task. @@ -503,6 +506,8 @@ def launch( no_setup=no_setup, clone_disk_from=clone_disk_from, _is_launched_by_spot_controller=_is_launched_by_spot_controller, + _is_launched_by_sky_serve_controller= + _is_launched_by_sky_serve_controller, ) @@ -777,7 +782,6 @@ def spot_launch( idle_minutes_to_autostop=spot. SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, - minimize_logging=True, ) @@ -1073,7 +1077,7 @@ def serve_up( fore = colorama.Fore style = colorama.Style - print(f'\n{fore.YELLOW}Launching controller for {service_name!r}...' 
+ print(f'{fore.YELLOW}Launching controller for {service_name!r}...' f'{style.RESET_ALL}') _execute( entrypoint=controller_task, @@ -1085,7 +1089,6 @@ def serve_up( # value and a previous controller could be reused. idle_minutes_to_autostop=serve.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, - minimize_logging=True, ) controller_record = global_user_state.get_cluster_from_name( @@ -1115,7 +1118,7 @@ def serve_up( print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' f'{style.RESET_ALL}') - print(f'\n{fore.CYAN}Service name: ' + print(f'{fore.CYAN}Service name: ' f'{style.BRIGHT}{service_name}{style.RESET_ALL}' '\nTo see detailed info:' f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' @@ -1123,6 +1126,8 @@ def serve_up( '\nTo see logs of one replica:' f'\t{backend_utils.BOLD}sky serve logs {service_name} ' f'[REPLICA_ID]{backend_utils.RESET_BOLD}' + f'\n(use {backend_utils.BOLD}sky serve status {service_name}' + f'{backend_utils.RESET_BOLD} to get all valid [REPLICA_ID])' '\nTo see logs of load balancer:' f'\t{backend_utils.BOLD}sky serve logs --load-balancer ' f'{service_name}{backend_utils.RESET_BOLD}' @@ -1132,15 +1137,13 @@ def serve_up( '\nTo teardown the service:' f'\t{backend_utils.BOLD}sky serve down {service_name}' f'{backend_utils.RESET_BOLD}' - f'\n(use {backend_utils.BOLD}sky serve status {service_name}' - f'{backend_utils.RESET_BOLD} to get all valid REPLICA_ID)') - print(f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' - f'{style.RESET_ALL}{fore.CYAN}' - f'{endpoint}{style.RESET_ALL}') - print(f'{fore.GREEN}Starting replicas now...{style.RESET_ALL}') - print('\nTo monitor replica status:' + '\nTo monitor replica status:' f'\t{backend_utils.BOLD}watch -n10 sky serve status ' f'{service_name}{backend_utils.RESET_BOLD}' '\nTo send a test request:' f'\t\t{backend_utils.BOLD}curl -L $(sky serve status ' - f'{service_name} --endpoint){backend_utils.RESET_BOLD}') + f'{service_name} --endpoint){backend_utils.RESET_BOLD}' 
+ f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' + f'{style.RESET_ALL}{fore.CYAN}' + f'{endpoint}{style.RESET_ALL}' + f'\n{fore.GREEN}Starting replicas now...{style.RESET_ALL}') diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 0544ff53fa9..531e559ee82 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -5,9 +5,9 @@ from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.serve.constants import CONTROLLER_PREFIX from sky.serve.constants import CONTROLLER_RESOURCES -from sky.serve.constants import CONTROLLER_SYNC_INTERVAL from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import ENDPOINT_PROBE_INTERVAL +from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL from sky.serve.constants import LOAD_BALANCER_PORT_RANGE from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 533ba86b7b8..9dbfa80d626 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -55,7 +55,7 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', self.min_replicas: int = spec.min_replicas self.max_replicas: int = spec.max_replicas or spec.min_replicas self.frequency = frequency - if self.frequency < constants.CONTROLLER_SYNC_INTERVAL: + if self.frequency < constants.LB_CONTROLLER_SYNC_INTERVAL: logger.warning('Autoscaler frequency is less than ' 'controller sync interval. It might ' 'not always got the latest information.') diff --git a/sky/serve/constants.py b/sky/serve/constants.py index dde3be40c8a..5cc365fb107 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -33,7 +33,7 @@ # The time interval for load balancer to sync with controller. Every time the # load balancer syncs with controller, it will update all available replica ips # for each service, also send the number of requests in last query interval. 
-CONTROLLER_SYNC_INTERVAL = 20 +LB_CONTROLLER_SYNC_INTERVAL = 20 # Interval to probe replica endpoint. ENDPOINT_PROBE_INTERVAL = 10 diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 5b7a0cde8cc..449030a1bd4 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -40,7 +40,7 @@ def __init__(self, controller_url: str, load_balancer_port: int, def _sync_with_controller(self): """Sync with controller periodically. - Every `constants.CONTROLLER_SYNC_INTERVAL` seconds, the load balancer + Every `constants.LB_CONTROLLER_SYNC_INTERVAL` seconds, the load balancer will sync with the controller to get the latest information about available replicas; also, it report the request information to the controller, so that the controller can make autoscaling decisions. @@ -70,7 +70,7 @@ def _sync_with_controller(self): logger.info(f'Available Replica IPs: {ready_replica_ips}') self.load_balancing_policy.set_ready_replicas( ready_replica_ips) - time.sleep(constants.CONTROLLER_SYNC_INTERVAL) + time.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL) async def _redirect_handler(self, request: fastapi.Request): self.request_information.add(request) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index cfa66c0c331..e787b4dec17 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -63,7 +63,8 @@ def launch_cluster(task_yaml_path: str, cluster_name, detach_setup=True, detach_run=True, - retry_until_up=True) + retry_until_up=True, + _is_launched_by_sky_serve_controller=True) logger.info(f'Replica cluster {cluster_name} launched.') except (exceptions.InvalidClusterNameError, exceptions.NoCloudAccessError, diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index b2703826403..f8a88f3157a 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -186,7 +186,7 @@ def __repr__(self) -> str: Readiness probe method: {self.probe_str()} Readiness initial delay seconds: 
{self.initial_delay_seconds} Replica autoscaling policy: {self.policy_str()} - Replica auto restart: {self.auto_restart} + Replica auto restart: {self.auto_restart}\ """) @property diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 2c1bc4e3d9c..8864484df4b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2631,7 +2631,7 @@ def _get_service_name() -> str: # `REPLICAS` is in the form of `1/2` where the first number is the number of # ready replicas and the second number is the number of total replicas. We # grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for serve.CONTROLLER_SYNC_INTERVAL to +# failure detected. In the end we sleep for serve.LB_CONTROLLER_SYNC_INTERVAL to # make sure load balancer have enough time to sync with the controller and get # all ready replica IPs. _SERVE_WAIT_UNTIL_READY = ( @@ -2640,7 +2640,7 @@ def _get_service_name() -> str: ' echo "$output" | grep -q "{replica_num}/{replica_num}" && break;' ' echo "$output" | grep -q "FAILED" && exit 1;' ' sleep 10;' - f' done); sleep {serve.CONTROLLER_SYNC_INTERVAL};') + f' done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL};') _IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' _ENDPOINT_REGEX = _IP_REGEX + r':[0-9]{1,5}' _AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' @@ -2751,11 +2751,11 @@ def terminate_replica(replica_id: int) -> str: f' --quiet $({query_cmd})') # In the worst case, the controller will first wait ENDPOINT_PROBE_INTERVAL - # for next probe, and wait CONTROLLER_SYNC_INTERVAL for load balancer's + # for next probe, and wait LB_CONTROLLER_SYNC_INTERVAL for load balancer's # next sync with controller. We add 5s more for any overhead, such as # database read/write. 
time_to_wait_after_terminate = (serve.ENDPOINT_PROBE_INTERVAL + - serve.CONTROLLER_SYNC_INTERVAL + 5) + serve.LB_CONTROLLER_SYNC_INTERVAL + 5) test = Test( f'test-skyserve-replica-failure', @@ -2818,7 +2818,7 @@ def terminate_replica(replica_id: int) -> str: f' output=$(sky serve status {name});' ' echo "$output" | grep -q "1/1" && break;' ' sleep 10;' - f'done); sleep {serve.CONTROLLER_SYNC_INTERVAL};', + f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL};', f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], f'sky serve down -y {name}', From 6d646a6410a796c283d08dc0d390eb38013c545b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 16 Oct 2023 22:29:45 -0700 Subject: [PATCH 135/223] not count as failure if UP for more than initial_delay_seconds --- sky/core.py | 2 +- sky/serve/replica_managers.py | 71 +++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/sky/core.py b/sky/core.py index c7031fadcdf..49e13cfd1b1 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1017,7 +1017,7 @@ def serve_status( 'controller_port': (Optional[int]) controller port, 'load_balancer_port': (Optional[int]) load balancer port, 'policy': (Optional[str]) load balancer policy description, - 'auto_restart': (bool) whether the service replcia will be + 'auto_restart': (bool) whether the service replica will be auto-restarted, 'requested_resources': (sky.Resources) requested resources for replica, diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index e787b4dec17..271eb05569a 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -157,7 +157,7 @@ class ReplicaStatusProperty: sky_launch_status: Process status of sky.launch. user_app_failed: Whether the service job failed. service_ready_now: Latest readiness probe result. - service_once_ready: Whether the service has been ready at least once. + first_ready_time: The first time the service is ready. 
sky_down_status: Process status of sky.down. """ # Initial value is RUNNING since each `ReplicaInfo` is created @@ -165,20 +165,36 @@ class ReplicaStatusProperty: sky_launch_status: ProcessStatus = ProcessStatus.RUNNING user_app_failed: bool = False service_ready_now: bool = False - service_once_ready: bool = False + # None means readiness probe is not passed yet. + first_ready_time: Optional[float] = None # None means sky.down is not called yet. sky_down_status: Optional[ProcessStatus] = None - def is_scale_down_succeeded(self) -> bool: + def is_scale_down_succeeded(self, initial_delay_seconds: int) -> bool: if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False if self.sky_down_status != ProcessStatus.SUCCEEDED: return False + if (self.first_ready_time is not None and + time.time() - self.first_ready_time > initial_delay_seconds): + # If the service is up for more than `initial_delay_seconds`, + # we assume there is no bug in the user code and the scale down + # is successful, thus enabling the controller to remove the + # replica from the replica table and auto restart the replica. + # Here we assume that initial_delay_seconds is larger than + # consecutive_failure_threshold_seconds, so if a replica is not + # teardown for initial_delay_seconds, it is safe to assume that + # it is UP for initial_delay_seconds. + # For replica with a failed sky.launch, it is likely due to some + # misconfigured resources, so we don't want to auto restart it. + # For replica with a failed sky.down, we cannot restart it since + # otherwise we will have a resource leak. 
+ return True if self.user_app_failed: return False if not self.service_ready_now: return False - return self.service_once_ready + return self.first_ready_time is not None def should_track_status(self) -> bool: if self.sky_launch_status != ProcessStatus.SUCCEEDED: @@ -203,7 +219,7 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: if self.user_app_failed: # Failed on user setup/run return serve_state.ReplicaStatus.FAILED - if not self.service_once_ready: + if self.first_ready_time is None: # initial delay seconds exceeded return serve_state.ReplicaStatus.FAILED if not self.service_ready_now: @@ -229,7 +245,7 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: # Failed on user setup/run # Same as above return serve_state.ReplicaStatus.FAILED_CLEANUP - if self.service_once_ready: + if self.first_ready_time is not None: # Service was ready before but not now return serve_state.ReplicaStatus.NOT_READY else: @@ -286,9 +302,14 @@ def probe( self, readiness_suffix: str, post_data: Optional[Dict[str, Any]], - ) -> Tuple['ReplicaInfo', bool]: - """Probe the readiness of the replica.""" + ) -> Tuple['ReplicaInfo', bool, float]: + """Probe the readiness of the replica. + + Returns: + Tuple of (self, is_ready, probe_time). + """ replica_ip = self.ip + probe_time = time.time() try: msg = '' # TODO(tian): Support HTTPS in the future. @@ -313,12 +334,12 @@ def probe( logger.info(msg) if response.status_code == 200: logger.info(f'Replica {replica_ip} is ready.') - return self, True + return self, True, probe_time except requests.exceptions.RequestException as e: logger.info(e) logger.info(f'Replica {replica_ip} is not ready.') pass - return self, False + return self, False, probe_time class ReplicaManager: @@ -523,8 +544,13 @@ def _refresh_process_pool(self) -> None: # Failed replica still count as a replica. In our current # design, we want to fail early if user code have any error. # This will prevent infinite loop of teardown and - # re-provision. 
- if info.status_property.is_scale_down_succeeded(): + # re-provision. However, there is a special case that if the + # replica is UP for longer than initial_delay_seconds, we + # assume it is just some random failure and we should restart + # the replica. Please refer to the implementation of + # `is_scale_down_succeeded` for more details. + if info.status_property.is_scale_down_succeeded( + self.initial_delay_seconds): # This means the cluster is deleted due to # a scale down. Delete the replica info # so it won't count as a replica. @@ -623,33 +649,32 @@ def _probe_all_replicas(self) -> None: # object as well, so that we could update the info object in the # same order. for future in futures.as_completed(probe_futures): - future_result: Tuple[ReplicaInfo, bool] = future.result() - info, probe_succeeded = future_result + future_result: Tuple[ReplicaInfo, bool, float] = future.result() + info, probe_succeeded, probe_time = future_result info.status_property.service_ready_now = probe_succeeded should_teardown = False if probe_succeeded: if self.uptime is None: - self.uptime = time.time() + self.uptime = probe_time logger.info(f'Replica {info.replica_id} is the first ready ' f'replica. 
Setting uptime to {self.uptime}.') serve_state.set_service_uptime(self.service_name, int(self.uptime)) info.consecutive_failure_times.clear() - if not info.status_property.service_once_ready: - info.status_property.service_once_ready = True + if info.status_property.first_ready_time is None: + info.status_property.first_ready_time = probe_time else: - current_time = time.time() if info.first_not_ready_time is None: - info.first_not_ready_time = current_time - if info.status_property.service_once_ready: - info.consecutive_failure_times.append(current_time) + info.first_not_ready_time = probe_time + if info.status_property.first_ready_time is not None: + info.consecutive_failure_times.append(probe_time) consecutive_failure_time = ( info.consecutive_failure_times[-1] - info.consecutive_failure_times[0]) if (consecutive_failure_time >= _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): logger.info( - f'Replica {info.replica_id} is not ready for ' + f'Replica {info.replica_id} is not ready for ' 'too long and exceeding consecutive failure ' 'threshold. Terminating the replica...') should_teardown = True @@ -661,7 +686,7 @@ def _probe_all_replicas(self) -> None: f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). 
' 'Skipping.') else: - current_delay_seconds = (current_time - + current_delay_seconds = (probe_time - info.first_not_ready_time) if current_delay_seconds > self.initial_delay_seconds: logger.info( From 32aa29471702bb78c17fbb78c6dd3b3ae2339eb1 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 18 Oct 2023 10:42:28 -0700 Subject: [PATCH 136/223] default auto restart to true --- sky/backends/backend_utils.py | 2 +- sky/serve/autoscalers.py | 16 +++++----------- sky/serve/replica_managers.py | 8 +++++--- sky/serve/service_spec.py | 6 +++--- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 80e06752b53..53be3f9b008 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2722,7 +2722,7 @@ def _add_default_value_to_local_record( 'controller_port': None, 'load_balancer_port': None, 'policy': None, - 'auto_restart': False, + 'auto_restart': True, 'requested_resources': sky.Resources(), }) return record diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 9dbfa80d626..8f38cd04e53 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -45,13 +45,11 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', """Initialize the autoscaler. Variables: - auto_restart: Whether to restart failed replicas. min_replicas: Minimum number of replicas. max_replicas: Maximum number of replicas. Default to fixed number of replicas, i.e. min_replicas == max_replicas. frequency: Frequency of autoscaling in seconds. 
""" - self.auto_restart = spec.auto_restart self.min_replicas: int = spec.min_replicas self.max_replicas: int = spec.max_replicas or spec.min_replicas self.frequency = frequency @@ -111,16 +109,12 @@ def update_request_information( current_time - self.rps_window_size) self.request_timestamps = self.request_timestamps[index:] - def evaluate_scaling(self, infos: List[Dict[str, - Any]]) -> AutoscalerDecision: + def evaluate_scaling( + self, + infos: List[Dict[str, Any]], + ) -> AutoscalerDecision: current_time = time.time() - if not self.auto_restart: - num_replicas = len(infos) - else: - num_replicas = len([ - i for i in infos - if i['status'] != serve_state.ReplicaStatus.FAILED - ]) + num_replicas = len(infos) # Check if cooldown period has passed since the last scaling operation. # Only cooldown if bootstrapping is done. diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 271eb05569a..7cef7aa5c9b 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -170,12 +170,13 @@ class ReplicaStatusProperty: # None means sky.down is not called yet. 
sky_down_status: Optional[ProcessStatus] = None - def is_scale_down_succeeded(self, initial_delay_seconds: int) -> bool: + def is_scale_down_succeeded(self, initial_delay_seconds: int, + auto_restart: bool) -> bool: if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False if self.sky_down_status != ProcessStatus.SUCCEEDED: return False - if (self.first_ready_time is not None and + if (auto_restart and self.first_ready_time is not None and time.time() - self.first_ready_time > initial_delay_seconds): # If the service is up for more than `initial_delay_seconds`, # we assume there is no bug in the user code and the scale down @@ -350,6 +351,7 @@ def __init__(self, service_name: str, self.lock = threading.Lock() self.next_replica_id: int = 1 self.service_name: str = service_name + self.auto_restart = spec.auto_restart self.readiness_suffix: str = spec.readiness_suffix self.initial_delay_seconds: int = spec.initial_delay_seconds self.post_data: Optional[Dict[str, Any]] = spec.post_data @@ -550,7 +552,7 @@ def _refresh_process_pool(self) -> None: # the replica. Please refer to the implementation of # `is_scale_down_succeeded` for more details. if info.status_property.is_scale_down_succeeded( - self.initial_delay_seconds): + self.initial_delay_seconds, self.auto_restart): # This means the cluster is deleted due to # a scale down. Delete the replica info # so it won't count as a replica. 
diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index f8a88f3157a..fecbceb6a1d 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -25,7 +25,7 @@ def __init__( qps_upper_threshold: Optional[float] = None, qps_lower_threshold: Optional[float] = None, post_data: Optional[Dict[str, Any]] = None, - auto_restart: bool = False, + auto_restart: bool = True, ) -> None: if min_replicas < 0: with ux_utils.print_exception_no_traceback(): @@ -104,7 +104,7 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': service_config['max_replicas'] = None service_config['qps_upper_threshold'] = None service_config['qps_lower_threshold'] = None - service_config['auto_restart'] = False + service_config['auto_restart'] = True else: service_config['min_replicas'] = policy_section['min_replicas'] service_config['max_replicas'] = policy_section.get( @@ -114,7 +114,7 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': service_config['qps_lower_threshold'] = policy_section.get( 'qps_lower_threshold', None) service_config['auto_restart'] = policy_section.get( - 'auto_restart', False) + 'auto_restart', True) return SkyServiceSpec(**service_config) From fa4fdfadfee81a55042bdb441be37ca01808a15a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 18 Oct 2023 11:26:56 -0700 Subject: [PATCH 137/223] add todo --- sky/serve/replica_managers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 7cef7aa5c9b..ff243f85c74 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -551,6 +551,9 @@ def _refresh_process_pool(self) -> None: # assume it is just some random failure and we should restart # the replica. Please refer to the implementation of # `is_scale_down_succeeded` for more details. + # TODO(tian): Currently, restart replicas that failed within + # initial_delay_seconds is not supported. 
We should add it + # later when we support `sky serve update`. if info.status_property.is_scale_down_succeeded( self.initial_delay_seconds, self.auto_restart): # This means the cluster is deleted due to From 756a669e9252a20d7c1954ee2c16ef8bcde04a0c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 18 Oct 2023 16:03:12 -0700 Subject: [PATCH 138/223] upd test for auto_restart --- tests/skyserve/auto_restart.yaml | 4 +--- tests/skyserve/replica_failure/service.yaml | 4 +++- tests/test_smoke.py | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index 98c3d255e4e..5cd60653b76 100644 --- a/tests/skyserve/auto_restart.yaml +++ b/tests/skyserve/auto_restart.yaml @@ -12,6 +12,4 @@ service: readiness_probe: path: /health initial_delay_seconds: 20 - replica_policy: - min_replicas: 1 - auto_restart: true + replicas: 1 diff --git a/tests/skyserve/replica_failure/service.yaml b/tests/skyserve/replica_failure/service.yaml index cfba56168d8..7951f5696f8 100644 --- a/tests/skyserve/replica_failure/service.yaml +++ b/tests/skyserve/replica_failure/service.yaml @@ -15,4 +15,6 @@ service: path: /health # For install dependencies initial_delay_seconds: 180 - replicas: 3 + replica_policy: + min_replicas: 3 + auto_restart: false diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8864484df4b..6eda5b5feea 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2812,6 +2812,9 @@ def terminate_replica(replica_id: int) -> str: f'sky serve up -n {name} -y tests/skyserve/auto_restart.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', + # sleep for 20 seconds (initial delay) to make sure it will + # be restarted + f'sleep 20', terminate_replica(1), 'sleep 180', # Wait for consecutive failure timeout passed. 
'(while true; do' From ea992a0e4b6df2346a05fee8e16790578483024c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 20 Oct 2023 17:37:34 -0700 Subject: [PATCH 139/223] Use only one controller and remove local database. TODO: change to SERVICE_ID to avoid name conflict. --- sky/backends/backend_utils.py | 229 +++++++++++------------ sky/backends/cloud_vm_ray_backend.py | 41 +++- sky/cli.py | 267 +++++++++++---------------- sky/core.py | 152 +++++++-------- sky/execution.py | 122 +++--------- sky/global_user_state.py | 108 ----------- sky/serve/__init__.py | 5 +- sky/serve/constants.py | 15 +- sky/serve/serve_state.py | 47 +++-- sky/serve/serve_utils.py | 261 +++++++------------------- sky/serve/service.py | 18 +- sky/spot/__init__.py | 2 - sky/spot/spot_utils.py | 66 ------- sky/utils/cli_utils/status_utils.py | 25 ++- 14 files changed, 491 insertions(+), 867 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 53be3f9b008..63c38f64724 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -133,14 +133,14 @@ class ReservedClusterGroup(enum.Enum): 'managed spot controller. ')) SKY_SERVE_CONTROLLER = ReservedClusterRecord( group_name='Sky Serve controller', - check=lambda name: name.startswith(serve_lib.CONTROLLER_PREFIX), + check=lambda name: name == serve_lib.SKY_SERVE_CONTROLLER_NAME, sky_status_hint=( f'* To see detailed service status: {colorama.Style.BRIGHT}' f'sky serve status{colorama.Style.RESET_ALL}'), decline_cancel_hint=( 'Cancelling the sky serve controller\'s jobs is not allowed.'), check_cluster_name_hint=( - f'Cluster prefix {serve_lib.CONTROLLER_PREFIX} is reserved for ' + f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' 'sky serve controller. 
')) @classmethod @@ -162,10 +162,6 @@ def get_group(cls, name: Optional[str]) -> Optional['ReservedClusterGroup']: CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20 -# Filelocks for the service status change. -SERVICE_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.service.{}.lock') -SERVICE_STATUS_LOCK_TIMEOUT_SECONDS = 20 - # Remote dir that holds our runtime files. _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' @@ -2558,6 +2554,84 @@ def check_cluster_available( return handle +# TODO(tian): Probably use ReservedClusterGroup and add some attr for the msg? +def is_controller_up( + is_spot: bool, + stopped_message: str, + non_existent_message: Optional[str] = None, +) -> Tuple[Optional[status_lib.ClusterStatus], + Optional['backends.CloudVmRayResourceHandle']]: + """Check if the spot/serve controller is up. + + It can be used to check the actual controller status (since the autostop is + set for the controller) before the spot/serve commands interact with the + controller. + + Args: + is_spot: Whether the type of the controller is spot. + stopped_message: Message to print if the controller is STOPPED. + non_existent_message: Message to show if the controller does not exist. + + Returns: + controller_status: The status of the controller. If it fails during + refreshing the status, it will be the cached status. None if the + controller does not exist. + handle: The ResourceHandle of the controller. None if the + controller is not UP or does not exist. + + Raises: + exceptions.ClusterOwnerIdentityMismatchError: if the current user is not + the same as the user who created the cluster. + exceptions.CloudUserIdentityError: if we fail to get the current user + identity. + """ + if is_spot: + controller_name = spot_lib.SPOT_CONTROLLER_NAME + controller_hint = 'spot' + if non_existent_message is None: + non_existent_message = 'No managed spot jobs are found.' 
+ else: + controller_name = serve_lib.SKY_SERVE_CONTROLLER_NAME + controller_hint = 'sky serve' + if non_existent_message is None: + non_existent_message = 'No service is found.' + try: + # Set force_refresh_statuses=None to make sure the refresh only happens + # when the controller is INIT/UP (triggered in these statuses as the + # autostop is always set for the controller). This optimization avoids + # unnecessary costly refresh when the controller is already stopped. + # This optimization is based on the assumption that the user will not + # start the controller manually from the cloud console. + controller_status, handle = refresh_cluster_status_handle( + controller_name, force_refresh_statuses=None) + except exceptions.ClusterStatusFetchingError as e: + # We do not catch the exceptions related to the cluster owner identity + # mismatch, please refer to the comment in + # `backend_utils.check_cluster_available`. + logger.warning( + 'Failed to get the status of the controller. It is not ' + f'fatal, but {controller_hint} commands/calls may hang or return ' + 'stale information, when the controller is not up.\n' + f' Details: {common_utils.format_exception(e, use_bracket=True)}') + record = global_user_state.get_cluster_from_name(controller_name) + controller_status, handle = None, None + if record is not None: + controller_status, handle = record['status'], record['handle'] + + if controller_status is None: + sky_logging.print(non_existent_message) + elif controller_status != status_lib.ClusterStatus.UP: + msg = (f'{controller_hint.capitalize()} controller {controller_name} ' + f'is {controller_status.value}.') + if controller_status == status_lib.ClusterStatus.STOPPED: + msg += f'\n{stopped_message}' + if controller_status == status_lib.ClusterStatus.INIT: + msg += '\nPlease wait for the controller to be ready.' + sky_logging.print(msg) + handle = None + return controller_status, handle + + class CloudFilter(enum.Enum): # Filter for all types of clouds. 
ALL = 'all' @@ -2710,138 +2784,59 @@ def _refresh_cluster(cluster_name): return kept_records -def _add_default_value_to_local_record( - record: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: - # NOTE(dev): Keep this align with sky.serve.controller.get_latest_info - if record is None: - return record - record.update({ - 'replica_info': [], - 'uptime': None, - 'status': serve_lib.ServiceStatus.UNKNOWN, - 'controller_port': None, - 'load_balancer_port': None, - 'policy': None, - 'auto_restart': True, - 'requested_resources': sky.Resources(), - }) - return record - - -def _refresh_service_record_no_lock( - service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: - """Refresh the service, and return the possibly updated record. +def refresh_service_status( + service_names: Optional[List[str]]) -> List[Dict[str, Any]]: + """Refresh the status of the services. Args: - service_name: The name of the service. + service_names: If provided, only refresh the status of the specified + services. Otherwise, refresh the status of all services. Returns: - A tuple of a possibly updated record and an error message if any error - occurred when refreshing the service. + A list of updated service records. """ - record = global_user_state.get_service_from_name(service_name) - if record is None: - return None, None - _add_default_value_to_local_record(record) - try: check_network_connection() except exceptions.NetworkError: - return record, 'Failed to refresh replica info due to network error.' - - if not record['endpoint']: - # Service controller is still initializing. Skipped refresh status. 
- record['status'] = serve_lib.ServiceStatus.CONTROLLER_INIT - return record, None - - controller_name = record['controller_name'] - status, handle = refresh_cluster_status_handle(controller_name) - if status is None or status == status_lib.ClusterStatus.STOPPED: - return record, None + logger.warning('Failed to refresh service status due to network error.') + return [] + + # TODO(tian): This is so slow... It will take ~10s to refresh the status + # of controller. Can we optimize this? + controller_status, handle = is_controller_up( + is_spot=False, stopped_message='No service is found.') + + if handle is None or handle.head_ip is None: + # When the controller is STOPPED, the head_ip will be None, as + # it will be set in global_user_state.remove_cluster(). + # We do not directly check for UP because the controller may be + # in INIT state during another spot launch, but still have + # head_ip available. In this case, we can still try to ssh + # into the controller and fetch the job table. 
+ raise exceptions.ClusterNotUpError('Sky serve controller is not up.', + cluster_status=controller_status) backend = get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = serve_lib.ServeCodeGen.get_latest_info(service_name) + code = serve_lib.ServeCodeGen.get_latest_info(service_names) returncode, latest_info_payload, stderr = backend.run_on_head( handle, code, require_outputs=True, stream_logs=False, separate_stderr=True) - if returncode != 0: - return record, stderr - - latest_info = serve_lib.load_latest_info(latest_info_payload) - record.update(latest_info) - return record, None - -def _refresh_service_record( - service_name: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: try: - with filelock.FileLock(SERVICE_STATUS_LOCK_PATH.format(service_name), - SERVICE_STATUS_LOCK_TIMEOUT_SECONDS): - return _refresh_service_record_no_lock(service_name) - except filelock.Timeout: - msg = ('Failed get the lock for service ' - f'{service_name!r}. 
Using the cached record.') - return _add_default_value_to_local_record( - global_user_state.get_service_from_name(service_name)), msg - - -# TODO(tian): We can optimize the number of ssh connections, by querying the -# statuses of multiple services in a single ssh -def refresh_service_status( - service_names: Optional[Union[str, List[str]]]) -> List[Dict[str, Any]]: - yellow = colorama.Fore.YELLOW - bright = colorama.Style.BRIGHT - reset = colorama.Style.RESET_ALL - - records = global_user_state.get_services() - if service_names is not None: - if isinstance(service_names, str): - service_names = [service_names] - new_records = [] - not_exist_service_names = [] - for service_name in service_names: - for record in records: - if record['name'] == service_name: - new_records.append(record) - break - else: - not_exist_service_names.append(service_name) - if not_exist_service_names: - services_str = ', '.join(not_exist_service_names) - logger.info(f'Service(s) not found: {bright}{services_str}{reset}.') - records = new_records - - service_names = [record['name'] for record in records] - - plural = 's' if len(service_names) > 1 else '' - progress = rich_progress.Progress(transient=True, - redirect_stdout=False, - redirect_stderr=False) - task = progress.add_task( - (f'[bold cyan]Refreshing status for {len(service_names)} ' - f'service{plural}[/]'), - total=len(service_names)) - - def _refresh_service(service_name: str) -> Optional[Dict[str, Any]]: - record, msg = _refresh_service_record(service_name) - if msg is not None: - progress.stop() - print(f'{yellow}Error occurred when refreshing service ' - f'{service_name}: {msg}{reset}') - progress.start() - progress.update(task, advance=1) - return record - - with progress: - updated_records = subprocess_utils.run_in_parallel( - _refresh_service, service_names) + subprocess_utils.handle_returncode(returncode, + code, + 'Failed to fetch services', + stderr, + stream_logs=False) + except exceptions.CommandError as e: + raise 
RuntimeError(e.error_msg) from e - return [record for record in updated_records if record is not None] + return serve_lib.load_latest_info(latest_info_payload) # Internal only: diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index e44e2a48a32..480b1bdaec0 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3274,6 +3274,7 @@ def _exec_code_on_head( executable: str, detach_run: bool = False, spot_dag: Optional['dag.Dag'] = None, + service_name: Optional[str] = None, ) -> None: """Executes generated code on the head node.""" style = colorama.Style @@ -3392,7 +3393,39 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif group is None: # Disable this logging for sky serve controller + elif (group == + backend_utils.ReservedClusterGroup.SKY_SERVE_CONTROLLER): + sn = service_name + logger.info( + f'{fore.CYAN}Service name: ' + f'{style.BRIGHT}{sn}{style.RESET_ALL}' + '\nTo see detailed info:\t\t' + f'{backend_utils.BOLD}sky serve status {sn} (-a)' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of one replica:\t' + f'{backend_utils.BOLD}sky serve logs {sn} [REPLICA_ID]' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of load balancer:\t' + f'{backend_utils.BOLD}sky serve logs --load-balancer {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of controller:\t' + f'{backend_utils.BOLD}sky serve logs --controller {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo teardown the service:\t\t' + f'{backend_utils.BOLD}sky serve down {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo monitor replica status:\t' + f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo send a test request:\t\t' + f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' + f'--endpoint){backend_utils.RESET_BOLD}' + f'\n(use {backend_utils.BOLD}sky serve status {sn}' + 
f'{backend_utils.RESET_BOLD} to get all valid [REPLICA_ID])' + f'\n{style.BRIGHT}{fore.GREEN}SkyServe is bootstrapping ' + 'your service now. The endpoint and replicas should be ' + f'ready within a short time.{style.RESET_ALL}') + else: logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -4715,7 +4748,8 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag) + spot_dag=task.spot_dag, + service_name=task.service_name) def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, @@ -4789,4 +4823,5 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag) + spot_dag=task.spot_dag, + service_name=task.service_name) diff --git a/sky/cli.py b/sky/cli.py index fdf3c49d4d6..4543f3ed4ce 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -148,18 +148,6 @@ def _get_glob_storages(storages: List[str]) -> List[str]: return list(set(glob_storages)) -def _get_glob_services(service_names: List[str]) -> List[str]: - """Returns a list of service names that match the glob pattern.""" - glob_service_names = [] - for service_name in service_names: - glob_service_name = global_user_state.get_glob_service_names( - service_name) - if not glob_service_name: - click.echo(f'Service {service_name} not found.') - glob_service_names.extend(glob_service_name) - return list(set(glob_service_names)) - - def _warn_if_local_cluster(cluster: str, local_clusters: List[str], message: str) -> bool: """Raises warning if the cluster name is a local cluster.""" @@ -499,13 +487,6 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter, return global_user_state.get_cluster_names_start_with(incomplete) -def _complete_service_name(ctx: click.Context, param: click.Parameter, - incomplete: str) -> List[str]: - """Handle shell 
completion for service names.""" - del ctx, param # Unused. - return global_user_state.get_glob_service_names(f'{incomplete}*') - - def _complete_storage_name(ctx: click.Context, param: click.Parameter, incomplete: str) -> List[str]: """Handle shell completion for storage names.""" @@ -2744,24 +2725,46 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): - services = global_user_state.get_services_from_controller_name( + cluster_status, _ = backend_utils.refresh_cluster_status_handle( controller_name) - if services: - # TODO(tian): When we switch to database for storing replica - # information, we could check total replicas of each service and - # allow terminating the controller if there is no existing replicas. - service_names = [service['name'] for service in services] + if cluster_status is None: + click.echo('Sky serve controller has already been torn down.') + return + + if cluster_status == status_lib.ClusterStatus.INIT: + # TODO(tian): Refactor to reserved group record. with ux_utils.print_exception_no_traceback(): - plural = '' if len(service_names) == 1 else 's' raise exceptions.NotSupportedError( f'{colorama.Fore.RED}Tearing down the sky serve controller ' - f'is not supported, as it is currently serving the following ' - f'service{plural}: {", ".join(service_names)}. Please teardown ' - f'the service{plural} first with {colorama.Style.BRIGHT}sky ' - f'serve down {" ".join(service_names)}' + 'while it is in INIT state is not supported (this means a sky ' + 'serve up is in progress or the previous launch failed), as we ' + 'cannot guarantee that all the services are terminated. Please ' + 'wait until the sky serve controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' f'{colorama.Style.RESET_ALL}.') - msg = f'Tearing down sky serve controller: {controller_name}.' 
- click.echo(msg) + elif cluster_status == status_lib.ClusterStatus.UP: + with rich_utils.safe_status( + '[bold cyan]Checking for running services[/]'): + try: + services = core.serve_status() + except exceptions.ClusterNotUpError: + cluster_status = backend_utils.refresh_cluster_status_handle( + controller_name) + services = [] + if services: + service_names = [service['name'] for service in services] + with ux_utils.print_exception_no_traceback(): + plural = '' if len(service_names) == 1 else 's' + raise exceptions.NotSupportedError( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + f'is not supported, as it is currently serving the ' + f'following service{plural}: {", ".join(service_names)}. ' + f'Please terminate the service{plural} first with ' + f'{colorama.Style.BRIGHT}sky serve down ' + f'{" ".join(service_names)}{colorama.Style.RESET_ALL}.') + # Do nothing for STOPPED state, as it is safe to terminate the cluster. + click.echo(f'Terminate sky serve controller: {controller_name}.') _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE = { @@ -3932,8 +3935,9 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): # Cancel managed spot jobs with IDs 1, 2, 3 $ sky spot cancel 1 2 3 """ - _, handle = spot_lib.is_spot_controller_up( - 'All managed spot jobs should have finished.') + _, handle = backend_utils.is_controller_up( + is_spot=True, + stopped_message='All managed spot jobs should have finished.') if handle is None: # Hint messages already printed by the call above. sys.exit(1) @@ -4016,7 +4020,8 @@ def spot_dashboard(port: Optional[int]): hint = ( 'Dashboard is not available if spot controller is not up. 
Run a spot ' 'job first.') - _, handle = spot_lib.is_spot_controller_up(stopped_message=hint, + _, handle = backend_utils.is_controller_up(is_spot=True, + stopped_message=hint, non_existent_message=hint) if handle is None: sys.exit(1) @@ -4095,16 +4100,6 @@ def serve_up( if service_name is None: service_name = backend_utils.generate_service_name() - previous_service_record = global_user_state.get_service_from_name( - service_name) - if previous_service_record is not None: - prompt = (f'Service {service_name!r} already exists. ' - 'Updating a service will be supported in the future. ' - 'For now, clean up the service and restart: ' - f'sky serve down {service_name}') - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(prompt) - is_yaml, _ = _check_yaml(''.join(entrypoint)) if not is_yaml: raise click.UsageError( @@ -4155,11 +4150,7 @@ def serve_up( is_flag=True, required=False, help='Show service endpoint.') -@click.argument('service_names', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_service_name)) +@click.argument('service_names', required=False, type=str, nargs=-1) @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, endpoint: bool, service_names: List[str]): @@ -4238,32 +4229,47 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): # Only show status of my-service sky serve status my-service """ + # This won't pollute the output of --endpoint. + with rich_utils.safe_status('[cyan]Checking services[/]'): + # TODO(tian): Add this to `sky status` as well. + msg = None + try: + with sky_logging.silent(): + if not service_names: + service_records = core.serve_status(None) + else: + service_records = core.serve_status(service_names) + except exceptions.ClusterNotUpError as e: + controller_status = e.cluster_status + if controller_status == status_lib.ClusterStatus.INIT: + msg = 'Controller is initializing. Please wait for a while.' 
+ else: + assert controller_status in [ + None, status_lib.ClusterStatus.STOPPED + ] + msg = 'No existing services.' + except RuntimeError as e: + msg = ('Failed to fetch service statuses due to connection issues. ' + 'Please try again later. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + except Exception as e: # pylint: disable=broad-except + msg = ('Failed to fetch service statuses: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + if msg is not None: + click.echo(msg) + return + if endpoint: - if len(service_names) != 1: + if len(service_records) != 1: plural = 's' if len(service_names) > 1 else '' service_num = (str(len(service_names)) if len(service_names) > 0 else 'No') raise click.UsageError( - f'{service_num} service{plural} specified. Please specify an' + f'{service_num} service{plural} found. Please specify an' ' existing service to show its endpoint. Usage: ' '`sky serve status --endpoint `') - service_name = service_names[0] - record = global_user_state.get_service_from_name(service_name) - if record is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'\nService {service_name!r} not found.') - service_endpoint = record['endpoint'] - if service_endpoint is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Endpoint not found for service {service_name!r}. 
' - 'Please check whether the service is ready.') - click.echo(service_endpoint) + click.echo(status_utils.get_endpoint(service_records[0])) return - query_services: Optional[List[str]] = None - if service_names: - query_services = _get_glob_services(service_names) - service_records = core.serve_status(query_services) click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' f'{colorama.Style.RESET_ALL}') status_utils.show_service_table(service_records, all) @@ -4272,26 +4278,16 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): replica_infos = [] for service_record in service_records: for replica_record in service_record['replica_info']: - # Only print FAILED replicas if: - # 1. --all is specified; - # 2. auto_restart is not enabled (in which FAILED replica count - # as one replica). - if (all or not service_record['auto_restart'] or - replica_record['status'] != serve_lib.ReplicaStatus.FAILED): - replica_record['service_name'] = service_record['name'] - replica_infos.append(replica_record) + replica_record['service_name'] = service_record['name'] + replica_infos.append(replica_record) status_utils.show_replica_table(replica_infos, all) @serve.command('down', cls=_DocumentedCodeCommand) -@click.argument('service_names', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_service_name)) +@click.argument('service_names', required=False, type=str, nargs=-1) @click.option('--all', '-a', - default=None, + default=False, is_flag=True, help='Stop all existing clusters.') @click.option('--yes', @@ -4300,18 +4296,8 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): default=False, required=False, help='Skip confirmation prompt.') -@click.option('--purge', - '-p', - is_flag=True, - default=False, - required=False, - help='Ignore errors (if any). 
') -def serve_down( - service_names: List[str], - all: Optional[bool], # pylint: disable=redefined-builtin - yes: bool, - purge: bool, -): +# pylint: disable=redefined-builtin +def serve_down(service_names: List[str], all: bool, yes: bool): """Teardown service(s). SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If @@ -4337,65 +4323,32 @@ def serve_down( sky serve down -a """ - if not service_names and not all: + service_names_str = ','.join(service_names) + if sum([len(service_names) > 0, all]) != 1: + argument_str = f'SERVICE_NAMES={service_names_str}' if len( + service_names) > 0 else '' + argument_str += ' --all' if all else '' raise click.UsageError( - '`sky serve down` requires either a service name (see ' - '`sky serve status`) or --all to be specified.') - if all: - service_names = [ - record['name'] for record in global_user_state.get_services() - ] - else: - service_names = _get_glob_services(service_names) + 'Can only specify one of SERVICE_NAMES or --all. ' + f'Provided {argument_str!r}.') - if not service_names: - click.echo('\nService(s) not found (tip: see `sky serve status`).') - return + _, handle = backend_utils.is_controller_up( + is_spot=False, + stopped_message='All services should have been terminated.') + if handle is None: + # Hint messages already printed by the call above. + sys.exit(1) - plural = '' if len(service_names) == 1 else 's' if not yes: - service_name_list = ', '.join(service_names) - click.confirm( - f'Tearing down {len(service_names)} service{plural}: ' - f'{service_name_list}. 
Proceed?', - default=True, - abort=True, - show_default=True) - - progress = rich_progress.Progress(transient=True, - redirect_stdout=False, - redirect_stderr=False) - task = progress.add_task( - f'[bold cyan]Tearing down {len(service_names)} service{plural}[/]', - total=len(service_names)) - - def _down_service(name: str): - success_progress = False - try: - sky.serve_down(name, purge) - except RuntimeError as e: - message = ( - f'{colorama.Fore.RED}Tearing down service {name!r}...failed. ' - 'Please manually clean up the replicas and then use --purge ' - f'to clean up the controller.{colorama.Style.RESET_ALL}' - f'\nReason: {common_utils.format_exception(e)}.') - else: - message = ( - f'{colorama.Fore.GREEN}Tearing down service {name!r}...done.' - f'{colorama.Style.RESET_ALL}') - success_progress = True - - progress.stop() - click.echo(message) - if success_progress: - progress.update(task, advance=1) - progress.start() + service_identity_str = f'services with name {service_names_str}' + if all: + service_identity_str = 'all services' + click.confirm(f'Terminating {service_identity_str}. Proceed?', + default=True, + abort=True, + show_default=True) - with progress: - subprocess_utils.run_in_parallel(_down_service, service_names) - progress.live.transient = False - # Make sure the progress bar not mess up the terminal. 
- progress.refresh() + sky.serve_down(service_names=service_names, all=all) @serve.command('logs', cls=_DocumentedCodeCommand) @@ -4421,10 +4374,7 @@ def _down_service(name: str): case_sensitive=False), required=False, help='Target to stream logs.') -@click.argument('service_name', - required=True, - type=str, - **_get_shell_complete_args(_complete_service_name)) +@click.argument('service_name', required=True, type=str) @click.argument('replica_id', required=False, type=int) @usage_lib.entrypoint def serve_logs( @@ -4460,6 +4410,7 @@ def serve_logs( # Tail the controller logs of a service: sky serve logs --controller --target load-balancer [SERVICE_ID] """ + # TODO(tian): nit: use sum([...]) have_replica_id = replica_id is not None num_flags = (controller + load_balancer + have_replica_id) if num_flags > 1: @@ -4489,10 +4440,14 @@ def serve_logs( 'REPLICA_ID must be specified when using --target replica.') else: target_component = sky.ServiceComponent.REPLICA - core.serve_tail_logs(service_name, - target=target_component, - replica_id=replica_id, - follow=follow) + try: + core.serve_tail_logs(service_name, + target=target_component, + replica_id=replica_id, + follow=follow) + except exceptions.ClusterNotUpError: + # Hint messages already printed by the call above. 
+ sys.exit(1) # ============================== diff --git a/sky/core.py b/sky/core.py index 49e13cfd1b1..6a8766a2658 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1,6 +1,5 @@ """SDK functions for cluster/job management.""" import getpass -import os import sys from typing import Any, Dict, List, Optional, Union @@ -21,7 +20,6 @@ from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib -from sky.utils import common_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import tpu_utils @@ -795,7 +793,8 @@ def spot_queue(refresh: bool, stop_msg = '' if not refresh: stop_msg = 'To view the latest job table: sky spot queue --refresh' - controller_status, handle = spot.is_spot_controller_up(stop_msg) + controller_status, handle = backend_utils.is_controller_up( + is_spot=True, stopped_message=stop_msg) if (refresh and controller_status in [ status_lib.ClusterStatus.STOPPED, status_lib.ClusterStatus.INIT @@ -866,10 +865,11 @@ def spot_cancel(name: Optional[str] = None, RuntimeError: failed to cancel the job. """ job_ids = [] if job_ids is None else job_ids - cluster_status, handle = spot.is_spot_controller_up( - 'All managed spot jobs should have finished.') + cluster_status, handle = backend_utils.is_controller_up( + is_spot=True, + stopped_message='All managed spot jobs should have finished.') if handle is None or handle.head_ip is None: - # The error message is already printed in spot.is_spot_controller_up. + # The error message is already printed in backend_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError('', @@ -924,9 +924,10 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int], sky.exceptions.ClusterNotUpError: the spot controller is not up. 
""" # TODO(zhwu): Automatically restart the spot controller - controller_status, handle = spot.is_spot_controller_up( - 'Please restart the spot controller with ' - f'`sky start {spot.SPOT_CONTROLLER_NAME}`.') + controller_status, handle = backend_utils.is_controller_up( + is_spot=True, + stopped_message=('Please restart the spot controller with ' + f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')) if handle is None or handle.head_ip is None: msg = 'All jobs finished.' if controller_status == status_lib.ClusterStatus.INIT: @@ -1045,6 +1046,9 @@ def serve_status( A list of dicts, with each dict containing the information of a service. If a service is not found, it will be omitted from the returned list. """ + if service_names is not None: + if isinstance(service_names, str): + service_names = [service_names] return backend_utils.refresh_service_status(service_names) @@ -1105,24 +1109,14 @@ def serve_tail_logs( with ux_utils.print_exception_no_traceback(): raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') - controller_name = global_user_state.get_service_controller_name( - service_name) - if controller_name is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r} does not exist. 
' - 'Cannot stream logs.') - controller_status, handle = backend_utils.refresh_cluster_status_handle( - controller_name) - if controller_status is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Cannot find controller for service {service_name}.') - if controller_status == status_lib.ClusterStatus.STOPPED: - with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError( - f'Controller for service {service_name} is auto-stopped.', - cluster_status=controller_status) - assert isinstance(handle, backends.CloudVmRayResourceHandle), handle + controller_status, handle = backend_utils.is_controller_up( + is_spot=False, stopped_message='No service is found.') + if handle is None or handle.head_ip is None: + msg = 'No service is found.' + if controller_status == status_lib.ClusterStatus.INIT: + msg = '' + raise exceptions.ClusterNotUpError(msg, + cluster_status=controller_status) backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend), backend backend.tail_serve_logs(handle, @@ -1133,71 +1127,65 @@ def serve_tail_logs( @usage_lib.entrypoint -def serve_down(service_name: str, purge: bool = False) -> None: +# pylint: disable=redefined-builtin +def serve_down(service_names: Optional[Union[str, List[str]]] = None, + all: bool = False) -> None: """Teardown a service. Please refer to the sky.cli.serve_down for the document. Args: - service_name: Name of the service. - purge: If true, ignore errors when cleaning up the controller. + service_names: Name of the service(s). Raises: - ValueError: if the service does not exist. + sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. + ValueError: if the arguments are invalid. RuntimeError: if failed to terminate the service. 
""" - controller_name = global_user_state.get_service_controller_name( - service_name) - - if controller_name is None: + if service_names is None: + service_names = [] + if isinstance(service_names, str): + service_names = [service_names] + cluster_status, handle = backend_utils.is_controller_up( + is_spot=False, stopped_message='All services should have terminated.') + if handle is None or handle.head_ip is None: + # The error message is already printed in backend_utils.is_controller_up + # TODO(zhwu): Move the error message into the exception. with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service {service_name!r} not found.') - - handle = global_user_state.get_handle_from_cluster_name(controller_name) + raise exceptions.ClusterNotUpError('', + cluster_status=cluster_status) - controller_fetch_ip_error_message = ( - 'Failed to fetch controller IP. Please refresh controller status by ' - '`sky status -r ` and try again.') + service_names_str = ','.join(service_names) + if sum([len(service_names) > 0, all]) != 1: + argument_str = f'service_names={service_names_str}' if len( + service_names) > 0 else '' + argument_str += ' all' if all else '' + raise ValueError('Can only specify one of service_names or all. 
' + f'Provided {argument_str!r}.') - if handle is None: - if not purge: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Cannot find controller of service {service_name!r}.') + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + if all: + code = serve.ServeCodeGen.terminate_services(None) else: - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - try: - code = serve.ServeCodeGen.terminate_service(service_name) - - try: - returncode, stdout, _ = backend.run_on_head( - handle, code, require_outputs=True, stream_logs=False) - except exceptions.FetchIPError as e: - raise RuntimeError(controller_fetch_ip_error_message) from e - - subprocess_utils.handle_returncode( - returncode, - code, ('Failed when submit termination request to controller ' - f'of service {service_name!r}'), - stdout, - stream_logs=False) - - # We want to make sure no matter what error happens, we can still - # clean up the record if purge is True. - # pylint: disable=broad-except - except Exception as e: - if purge: - logger.warning('Ignoring error when terminate ' - f'service {service_name!r}: ' - f'{common_utils.format_exception(e)}') - else: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError(e) from e - - # TODO(tian): Maybe add a post_cleanup function? - controller_yaml_path = serve.generate_controller_yaml_file_name( - service_name) - if os.path.exists(controller_yaml_path): - os.remove(controller_yaml_path) - global_user_state.remove_service(service_name) + code = serve.ServeCodeGen.terminate_services(service_names) + + try: + returncode, stdout, _ = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False) + except exceptions.FetchIPError as e: + raise RuntimeError( + 'Failed to fetch controller IP. 
Please refresh controller ' + f'status by `sky status -r {serve.SKY_SERVE_CONTROLLER_NAME}` ' + 'and try again.') from e + + try: + subprocess_utils.handle_returncode(returncode, code, + 'Failed to terminate service', + stdout) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + + sky_logging.print(stdout) diff --git a/sky/execution.py b/sky/execution.py index 62f84280b7d..d099eeaf69d 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -8,12 +8,10 @@ import os import re import tempfile -import time from typing import Any, Dict, List, Optional, Union import uuid import colorama -import filelock import sky from sky import backends @@ -983,11 +981,6 @@ def serve_up( 'only contains lower letters, numbers and dash): ' f'{constants.CLUSTER_NAME_VALID_REGEX}') - if global_user_state.get_service_from_name(service_name) is not None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service name {service_name!r} is already ' - 'taken. Please use a different name.') - if task.service is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError('Service section not found.') @@ -1010,8 +1003,8 @@ def serve_up( raise ValueError( _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( controller_type='serve', - err=common_utils.format_exception(e, - use_bracket=True))) from e + err=common_utils.format_exception(e, use_bracket=True), + )) from e assert task.service is not None, task assert len(task.resources) == 1, task @@ -1025,49 +1018,43 @@ def serve_up( task.set_resources( requested_resources.copy(ports=[task.service.replica_port])) - # Use filelock here to make sure only one process can write to database - # at the same time. Then we generate available controller name again to - # make sure even in race condition, we can still get the correct controller - # name. - # In the same time, generate ports for the controller and load balancer. - # Use file lock to make sure the ports are unique to each service. 
- with filelock.FileLock(os.path.expanduser(serve.CONTROLLER_FILE_LOCK_PATH)): - controller_name = serve.get_available_controller_name() - - global_user_state.add_service(service_name, - launched_at=int(time.time()), - controller_name=controller_name) - # TODO(tian): Probably run another sky.launch after we get - # the load balancer port from the controller? So we don't - # need to open so many ports here. - controller_resources = controller_resources.copy( - ports=[serve.LOAD_BALANCER_PORT_RANGE]) + controller_name = serve.SKY_SERVE_CONTROLLER_NAME + # TODO(tian): Probably run another sky.launch after we get the load balancer + # port from the controller? So we don't need to open so many ports here. Or, + # we should have a nginx traffic control to refuse any connection to the + # unregistered ports. + # TODO(tian): Probably choose the same cloud if replica cloud is specified? + controller_resources = controller_resources.copy( + ports=[serve.LOAD_BALANCER_PORT_RANGE]) _maybe_translate_local_file_mounts_and_sync_up(task, prefix='serve') - with tempfile.NamedTemporaryFile(prefix=f'serve-task-{service_name}-', - mode='w') as f: + with tempfile.NamedTemporaryFile( + prefix=f'service-task-{service_name}-', + mode='w', + ) as service_file, tempfile.NamedTemporaryFile( + prefix=f'controller-task-{service_name}-', + mode='w', + ) as controller_file: task_config = task.to_yaml_config() - common_utils.dump_yaml(f.name, task_config) - remote_task_yaml_path = serve.generate_remote_task_yaml_file_name( - service_name) + common_utils.dump_yaml(service_file.name, task_config) + remote_task_yaml_path = ( + serve.generate_remote_task_yaml_file_name(service_name)) controller_log_file = ( serve.generate_remote_controller_log_file_name(service_name)) vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, - 'local_task_yaml_path': f.name, + 'local_task_yaml_path': service_file.name, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, 'service_name': 
service_name, 'controller_log_file': controller_log_file, 'envs': _shared_controller_env_vars(), } - controller_yaml_path = serve.generate_controller_yaml_file_name( - service_name) backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, vars_to_fill, - output_path=controller_yaml_path) - controller_task = task_lib.Task.from_yaml(controller_yaml_path) + output_path=controller_file.name) + controller_task = task_lib.Task.from_yaml(controller_file.name) controller_task.set_resources(controller_resources) # Set this to modify default ray task CPU usage to custom value @@ -1075,10 +1062,8 @@ def serve_up( # to support a larger number of services. controller_task.service_name = service_name - fore = colorama.Fore - style = colorama.Style - print(f'{fore.YELLOW}Launching controller for {service_name!r}...' - f'{style.RESET_ALL}') + print(f'{colorama.Fore.YELLOW}Launching controller for ' + f'{service_name!r}...{colorama.Style.RESET_ALL}') _execute( entrypoint=controller_task, stream_logs=False, @@ -1090,60 +1075,3 @@ def serve_up( idle_minutes_to_autostop=serve.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) - - controller_record = global_user_state.get_cluster_from_name( - controller_name) - assert controller_record is not None - handle = controller_record['handle'] - assert isinstance(handle, backends.CloudVmRayResourceHandle) - with rich_utils.safe_status( - '[cyan]Waiting for service initialization...[/]'): - code = serve.ServeCodeGen.wait_for_load_balancer_port(service_name) - backend = backends.CloudVmRayBackend() - returncode, lb_port_payload, stderr = backend.run_on_head( - handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - subprocess_utils.handle_returncode( - returncode, code, - ('Failed to get load balancer port for service ' - f'{service_name!r}.'), stderr) - load_balancer_port = serve.decode_load_balancer_port( - lb_port_payload) - endpoint = f'{handle.head_ip}:{load_balancer_port}' - 
global_user_state.set_service_endpoint(service_name, endpoint) - - print(f'{fore.GREEN}Launching controller for {service_name!r}...done.' - f'{style.RESET_ALL}') - - print(f'{fore.CYAN}Service name: ' - f'{style.BRIGHT}{service_name}{style.RESET_ALL}' - '\nTo see detailed info:' - f'\t\t{backend_utils.BOLD}sky serve status {service_name} (-a)' - f'{backend_utils.RESET_BOLD}' - '\nTo see logs of one replica:' - f'\t{backend_utils.BOLD}sky serve logs {service_name} ' - f'[REPLICA_ID]{backend_utils.RESET_BOLD}' - f'\n(use {backend_utils.BOLD}sky serve status {service_name}' - f'{backend_utils.RESET_BOLD} to get all valid [REPLICA_ID])' - '\nTo see logs of load balancer:' - f'\t{backend_utils.BOLD}sky serve logs --load-balancer ' - f'{service_name}{backend_utils.RESET_BOLD}' - '\nTo see logs of controller:' - f'\t{backend_utils.BOLD}sky serve logs --controller ' - f'{service_name}{backend_utils.RESET_BOLD}' - '\nTo teardown the service:' - f'\t{backend_utils.BOLD}sky serve down {service_name}' - f'{backend_utils.RESET_BOLD}' - '\nTo monitor replica status:' - f'\t{backend_utils.BOLD}watch -n10 sky serve status ' - f'{service_name}{backend_utils.RESET_BOLD}' - '\nTo send a test request:' - f'\t\t{backend_utils.BOLD}curl -L $(sky serve status ' - f'{service_name} --endpoint){backend_utils.RESET_BOLD}' - f'\n{style.BRIGHT}{fore.CYAN}Endpoint URL: ' - f'{style.RESET_ALL}{fore.CYAN}' - f'{endpoint}{style.RESET_ALL}' - f'\n{fore.GREEN}Starting replicas now...{style.RESET_ALL}') diff --git a/sky/global_user_state.py b/sky/global_user_state.py index b4316746df1..6f94aa8098d 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -25,7 +25,6 @@ if typing.TYPE_CHECKING: from sky import backends - from sky import serve from sky.data import Storage _ENABLED_CLOUDS_KEY = 'enabled_clouds' @@ -93,13 +92,6 @@ def create_table(cursor, conn): handle BLOB, last_use TEXT, status TEXT)""") - # Table for Services - cursor.execute("""\ - CREATE TABLE IF NOT EXISTS services ( 
- name TEXT PRIMARY KEY, - launched_at INTEGER, - controller_name TEXT, - endpoint TEXT DEFAULT NULL)""") # For backward compatibility. # TODO(zhwu): Remove this function after all users have migrated to # the latest version of SkyPilot. @@ -280,30 +272,6 @@ def add_or_update_cluster(cluster_name: str, _DB.conn.commit() -def add_service(name: str, launched_at: int, controller_name: str) -> None: - _DB.cursor.execute( - 'INSERT INTO services' - '(name, launched_at, controller_name) ' - 'VALUES (' - # name - '?, ' - # launched_at - '?, ' - # controller_name - '?' - ')', - ( - # name - name, - # launched_at - launched_at, - # controller_name - controller_name, - )) - - _DB.conn.commit() - - def update_last_use(cluster_name: str): """Updates the last used command for the cluster.""" _DB.cursor.execute('UPDATE clusters SET last_use=(?) WHERE name=(?)', @@ -345,21 +313,6 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None: _DB.conn.commit() -def remove_service(service_name: str): - _DB.cursor.execute('DELETE FROM services WHERE name=(?)', (service_name,)) - _DB.conn.commit() - - -def set_service_endpoint(service_name: str, endpoint: str): - _DB.cursor.execute('UPDATE services SET endpoint=(?) 
' - 'WHERE name=(?)', (endpoint, service_name)) - count = _DB.cursor.rowcount - _DB.conn.commit() - assert count <= 1, count - if count == 0: - raise ValueError(f'Service {service_name} not found.') - - def get_handle_from_cluster_name( cluster_name: str) -> Optional['backends.ResourceHandle']: assert cluster_name is not None, 'cluster_name cannot be None' @@ -377,13 +330,6 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]: return [row[0] for row in rows] -def get_glob_service_names(service_name: str) -> List[str]: - assert service_name is not None, 'service_name cannot be None' - rows = _DB.cursor.execute('SELECT name FROM services WHERE name GLOB (?)', - (service_name,)) - return [row[0] for row in rows] - - def set_cluster_status(cluster_name: str, status: status_lib.ClusterStatus) -> None: _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', ( @@ -588,50 +534,6 @@ def get_cluster_from_name( return None -def _get_service_from_row(row) -> Dict[str, Any]: - # Explicitly specify the number of fields to unpack, so that - # we can add new fields to the database in the future without - # breaking the previous code. 
- name, launched_at, controller_name, endpoint = row[:4] - # TODO: use namedtuple instead of dict - return { - 'name': name, - 'launched_at': launched_at, - 'controller_name': controller_name, - 'endpoint': endpoint, - } - - -def get_service_from_name( - service_name: Optional[str]) -> Optional[Dict[str, Any]]: - rows = _DB.cursor.execute('SELECT * FROM services WHERE name=(?)', - (service_name,)).fetchall() - for row in rows: - return _get_service_from_row(row) - return None - - -def get_service_controller_name(service_name: Optional[str]) -> Optional[str]: - rows = _DB.cursor.execute( - 'SELECT controller_name FROM services WHERE name=(?)', - (service_name,)).fetchall() - for (controller_name,) in rows: - return controller_name - return None - - -def get_services_from_controller_name( - controller_name: str) -> List[Dict[str, Any]]: - rows = _DB.cursor.execute( - 'SELECT * FROM services WHERE controller_name=(?)', - (controller_name,)).fetchall() - records = [] - for row in rows: - record = _get_service_from_row(row) - records.append(record) - return records - - def get_clusters() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'select * from clusters order by launched_at desc').fetchall() @@ -658,16 +560,6 @@ def get_clusters() -> List[Dict[str, Any]]: return records -def get_services() -> List[Dict[str, Any]]: - rows = _DB.cursor.execute( - 'select * from services order by launched_at desc').fetchall() - records = [] - for row in rows: - record = _get_service_from_row(row) - records.append(record) - return records - - def get_clusters_from_history() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'SELECT ch.cluster_hash, ch.name, ch.num_nodes, ' diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 531e559ee82..fd2dc70f666 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -3,7 +3,6 @@ from sky.serve.constants import CONTROLLER_FILE_LOCK_PATH from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP -from 
sky.serve.constants import CONTROLLER_PREFIX from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import ENDPOINT_PROBE_INTERVAL @@ -13,15 +12,13 @@ from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus -from sky.serve.serve_utils import decode_load_balancer_port -from sky.serve.serve_utils import generate_controller_yaml_file_name from sky.serve.serve_utils import generate_remote_controller_log_file_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name -from sky.serve.serve_utils import get_available_controller_name from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent +from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME from sky.serve.service_spec import SkyServiceSpec os.makedirs(os.path.expanduser(SERVE_PREFIX), exist_ok=True) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 5cc365fb107..69497a953f0 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -2,11 +2,6 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 -# A prefix for all controller clusters. We use this prefix to identify a -# skyserve controller cluster. We will append a user hash and an incremental -# id to this prefix to generate a unique controller cluster name every time. -CONTROLLER_PREFIX = 'sky-serve-controller-' - CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' SERVE_PREFIX = '~/.sky/serve' @@ -24,12 +19,6 @@ # Signal file path for controller to handle signals. SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' -# Timeout for `sky serve down`. -SERVICE_TERMINATION_TIMEOUT = 180 - -# Timeout for waiting controller to find a port for service processes. 
-SERVICE_PORT_SELECTION_TIMEOUT = 60 - # The time interval for load balancer to sync with controller. Every time the # load balancer syncs with controller, it will update all available replica ips # for each service, also send the number of requests in last query interval. @@ -72,7 +61,9 @@ # set the memory usage to 2 GB to be safe. # In this setup, a default highmem controller with 4 vCPU and 32 GB memory can # run 16 services. -SERVICES_MEMORY_USAGE_GB = 2.0 +# TODO(tian): Since now we only have one job, we set this to 1 GB. Should do +# some benchmark to make sure this is safe. +SERVICES_MEMORY_USAGE_GB = 1.0 SERVICES_TASK_CPU_DEMAND = 0.125 # A period of time to initialize your service. Any readiness probe failures diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index ce445e6f576..3de99a333e9 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -120,18 +120,22 @@ class ServiceStatus(enum.Enum): # Service is being shutting down SHUTTING_DOWN = 'SHUTTING_DOWN' - # Cannot connect to controller - UNKNOWN = 'UNKNOWN' - # At least one replica is failed and no replica is ready FAILED = 'FAILED' # Clean up failed FAILED_CLEANUP = 'FAILED_CLEANUP' + # Max service number is reached and the service is pending + PENDING = 'PENDING' + @classmethod def failed_statuses(cls) -> List['ServiceStatus']: - return [cls.CONTROLLER_FAILED, cls.UNKNOWN, cls.FAILED_CLEANUP] + return [cls.CONTROLLER_FAILED, cls.FAILED_CLEANUP] + + @classmethod + def refuse_to_terminate_statuses(cls) -> List['ServiceStatus']: + return [cls.CONTROLLER_FAILED, cls.FAILED_CLEANUP, cls.SHUTTING_DOWN] def colored_str(self) -> str: color = _SERVICE_STATUS_TO_COLOR[self] @@ -156,19 +160,16 @@ def from_replica_statuses( ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, ServiceStatus.READY: colorama.Fore.GREEN, ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, - ServiceStatus.UNKNOWN: colorama.Fore.YELLOW, + ServiceStatus.PENDING: colorama.Fore.YELLOW, ServiceStatus.FAILED: 
colorama.Fore.RED, ServiceStatus.FAILED_CLEANUP: colorama.Fore.RED, } # === Service functions === -def add_service(name: str, - controller_job_id: int, - policy: str, - auto_restart: bool, - requested_resources: 'sky.Resources', - status: ServiceStatus = ServiceStatus.CONTROLLER_INIT) -> None: +def add_service(name: str, controller_job_id: int, policy: str, + auto_restart: bool, requested_resources: 'sky.Resources', + status: ServiceStatus) -> None: """Adds a service to the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( @@ -264,6 +265,30 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]: return None +def get_glob_service_names( + service_names: Optional[List[str]] = None) -> List[str]: + """Get service names matching the glob patterns. + + Args: + service_names: A list of glob patterns. If None, return all service + names. + + Returns: + A list of non-duplicated service names. + """ + with db_utils.safe_cursor(_DB_PATH) as cursor: + if service_names is None: + rows = cursor.execute('SELECT name FROM services').fetchall() + else: + rows = [] + for service_name in service_names: + rows.extend( + cursor.execute( + 'SELECT name FROM services WHERE name GLOB (?)', + (service_name,)).fetchall()) + return list({row[0] for row in rows}) + + # === Replica functions === def add_or_update_replica(service_name: str, replica_id: int, replica_info: 'replica_managers.ReplicaInfo') -> None: diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e60e13858d5..58f4f6de40e 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -9,11 +9,12 @@ import threading import time import typing -from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, Set, +from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, TextIO, Type, TypeVar) import colorama import filelock +import psutil from sky import backends from sky import exceptions @@ -27,7 +28,10 @@ if 
typing.TYPE_CHECKING: import fastapi - import sky +SKY_SERVE_CONTROLLER_NAME = ( + f'sky-serve-controller-{common_utils.get_user_hash()}') +_SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3) +NUM_SERVICE_THRESHOLD = _SYSTEM_MEMORY_GB // constants.SERVICES_MEMORY_USAGE_GB _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' @@ -175,46 +179,6 @@ def run(self, *args, **kwargs): self.func(*args, **kwargs) -def _get_existing_controller_names() -> Set[str]: - """Get existing sky serve controller names. - - There is two possible indicators for a controller: - 1. It is in the cluster database, which means it is already created; - 2. It is not in the cluster database but in the service database, - which means it will be created later in the future. This usually - happens when multiple `sky serve up` are running simultaneously. - - Returns: - A set of existing sky serve controller names. - """ - controller_in_service_db = { - record['controller_name'] - for record in global_user_state.get_services() - } - controller_in_cluster_db = { - record['name'] - for record in global_user_state.get_clusters() - if record['name'].startswith(constants.CONTROLLER_PREFIX) - } - return controller_in_service_db | controller_in_cluster_db - - -def generate_controller_cluster_name(existing_controllers: Set[str]) -> str: - index = 0 - while True: - controller_name = (f'{constants.CONTROLLER_PREFIX}' - f'{common_utils.get_user_hash()}-{index}') - if controller_name not in existing_controllers: - return controller_name - index += 1 - - -def generate_controller_yaml_file_name(service_name: str) -> str: - service_name = service_name.replace('-', '_') - prefix = os.path.expanduser(constants.SERVE_PREFIX) - return os.path.join(prefix, f'{service_name}_controller.yaml') - - def generate_remote_service_dir_name(service_name: str) -> str: service_name = service_name.replace('-', '_') return 
os.path.join(constants.SERVE_PREFIX, service_name) @@ -263,76 +227,6 @@ def generate_replica_cluster_name(service_name: str, replica_id: int) -> str: return f'{service_name}-{replica_id}' -def _get_service_slot_on_controller(controller_name: str) -> int: - """Get the number of slots to run services on the controller. - - A controller only have limited available slots for a new services. - Max number of slots on a controller is determined by the memory of - the controller, since ray job and our skypilot code is very memory - demanding (~1GB/service). - - Args: - controller_name: The name of the controller. - - Returns: - Number of slots on the controller. - """ - controller_memory = 0. - # Wait for the controller to be created. This could happen if multiple - # `sky serve up` are running simultaneously. - while True: - controller_record = global_user_state.get_cluster_from_name( - controller_name) - if controller_record is not None: - handle = controller_record['handle'] - assert isinstance(handle, backends.CloudVmRayResourceHandle) - # Determine max number of services on this controller. - controller_cloud = handle.launched_resources.cloud - _, controller_memory = ( - controller_cloud.get_vcpus_mem_from_instance_type( - handle.launched_resources.instance_type)) - assert controller_memory is not None - break - time.sleep(5) - # Determine max number of services on this controller. - max_services_num = int(controller_memory / - constants.SERVICES_MEMORY_USAGE_GB) - # Get current number of services on this controller. - services_num_on_controller = len( - global_user_state.get_services_from_controller_name(controller_name)) - return max_services_num - services_num_on_controller - - -def get_available_controller_name() -> str: - """Get available controller name to use. - - Only consider controllers that have available slots for services. - If multiple controllers are available, choose the one with most number of - services to decrease the number of controllers. 
- This function needs to be called within a lock, to avoid concurrency issue - from `existing_controllers` being staled, also, to avoid multiple - `sky serve up` select the same last slot on a controller. - - Returns: - Controller name to use. - """ - # Get all existing controllers. - existing_controllers = _get_existing_controller_names() - controller2slots = dict() - # Get a mapping from controller name to number of services on it. - for controller_name in existing_controllers: - num_slots = _get_service_slot_on_controller(controller_name) - # Only consider controllers that have available slot for services. - if num_slots > 0: - controller2slots[controller_name] = num_slots - if not controller2slots: - return generate_controller_cluster_name(existing_controllers) - # If multiple controllers are available, choose the one with least number of - # slots, i.e. most number of services. This helps to decrease the number of - # controllers. - return min(controller2slots.keys(), key=lambda k: controller2slots[k]) - - def set_service_status_from_replica_statuses( service_name: str, replica_statuses: List[serve_state.ReplicaStatus]) -> None: @@ -392,8 +286,6 @@ def get_latest_info(service_name: str, Returns: A dictionary of latest information of the service. 
""" - # NOTE(dev): Keep this align with - # sky.backends.backend_utils._add_default_value_to_local_record record = serve_state.get_service_from_name(service_name) if record is None: raise ValueError(f'Service {service_name!r} does not exist.') @@ -403,51 +295,60 @@ def get_latest_info(service_name: str, return record -def get_latest_info_encoded(service_name: str) -> str: - latest_info = get_latest_info(service_name) - latest_info = { - k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in latest_info.items() - } - return common_utils.encode_payload(latest_info) - - -def load_latest_info(payload: str) -> Dict[str, Any]: - latest_info = common_utils.decode_payload(payload) - latest_info = { - k: pickle.loads(base64.b64decode(v)) for k, v in latest_info.items() - } - return latest_info - - -def terminate_service(service_name: str) -> None: - # Send the terminate signal to controller. - signal_file = pathlib.Path(constants.SIGNAL_FILE_PATH.format(service_name)) - # Filelock is needed to prevent race condition between signal - # check/removal and signal writing. - with filelock.FileLock(str(signal_file) + '.lock'): - with signal_file.open(mode='w') as f: - f.write(UserSignal.TERMINATE.value) - f.flush() - print(f'Service {service_name!r} is scheduled to be terminated.') - for _ in range(constants.SERVICE_TERMINATION_TIMEOUT): - record = serve_state.get_service_from_name(service_name) - replica_infos = serve_state.get_replica_infos(service_name) - if record is None: - if not replica_infos: - return - elif record['status'] == serve_state.ServiceStatus.FAILED_CLEANUP: - raise RuntimeError( - f'Failed to terminate service {service_name!r}. Some ' - 'resources are not cleaned up properly. Please SSH to ' - 'the controller and manually clean up them. 
Find the ' - 'replicas that not been terminated by `sky serve status ' - f'{service_name!r}`.') - time.sleep(1) - raise RuntimeError( - f'Failed to terminate service {service_name!r}: timeout ' - f'after {constants.SERVICE_TERMINATION_TIMEOUT} seconds. ' - 'Please try again later.') +def get_latest_info_encoded(service_names: Optional[List[str]]) -> str: + latest_infos = [] + if service_names is None: + # Get all service names + service_names = serve_state.get_glob_service_names(None) + for service_name in service_names: + latest_info = get_latest_info(service_name) + latest_infos.append({ + k: base64.b64encode(pickle.dumps(v)).decode('utf-8') + for k, v in latest_info.items() + }) + return common_utils.encode_payload(latest_infos) + + +def load_latest_info(payload: str) -> List[Dict[str, Any]]: + latest_infos_encoded = common_utils.decode_payload(payload) + latest_infos = [] + for latest_info in latest_infos_encoded: + latest_infos.append({ + k: pickle.loads(base64.b64decode(v)) + for k, v in latest_info.items() + }) + return latest_infos + + +def terminate_services(service_names: Optional[List[str]]) -> str: + service_names = serve_state.get_glob_service_names(service_names) + terminated_service_names = [] + for service_name in service_names: + latest_info = get_latest_info(service_name, with_replica_info=False) + if (latest_info['status'] + in serve_state.ServiceStatus.refuse_to_terminate_statuses()): + # TODO(tian): Cleanup replicas for CONTROLLER_FAILED status. Seems + # like spot doesn't implement this yet? + continue + # Send the terminate signal to controller. + signal_file = pathlib.Path( + constants.SIGNAL_FILE_PATH.format(service_name)) + # Filelock is needed to prevent race condition between signal + # check/removal and signal writing. + with filelock.FileLock(str(signal_file) + '.lock'): + with signal_file.open(mode='w') as f: + # TODO(tian): Probably write a dict instead of bare string + # to the file? It will be helpful for update cases. 
+ f.write(UserSignal.TERMINATE.value) + f.flush() + terminated_service_names.append(service_name) + if len(terminated_service_names) == 0: + return 'No service to terminate.' + identity_str = f'Service with name {terminated_service_names[0]} is' + if len(terminated_service_names) > 1: + terminated_service_names_str = ', '.join(terminated_service_names) + identity_str = f'Services with names {terminated_service_names_str} are' + return f'{identity_str} scheduled to be terminated.' def check_service_status_healthy(service_name: str) -> Optional[str]: @@ -632,29 +533,6 @@ def _service_is_terminal() -> bool: return '' -def wait_for_load_balancer_port(service_name: str) -> str: - # Sleep for a while to bootstrap the load balancer. - time.sleep(5) - for _ in range(constants.SERVICE_PORT_SELECTION_TIMEOUT): - try: - latest_info = get_latest_info(service_name, with_replica_info=False) - except ValueError: - # Service is not created yet. - time.sleep(1) - continue - load_balancer_port = latest_info['load_balancer_port'] - if load_balancer_port is not None: - return common_utils.encode_payload(load_balancer_port) - time.sleep(1) - raise RuntimeError( - f'Failed to get load balancer port for service {service_name!r}: ' - f'timeout after {constants.SERVICE_PORT_SELECTION_TIMEOUT} seconds.') - - -def decode_load_balancer_port(payload: str) -> str: - return common_utils.decode_payload(payload) - - class ServeCodeGen: """Code generator for SkyServe. 
@@ -667,16 +545,19 @@ class ServeCodeGen: ] @classmethod - def get_latest_info(cls, service_name: str) -> str: + def get_latest_info(cls, service_names: Optional[List[str]]) -> str: code = [ - f'msg = serve_utils.get_latest_info_encoded({service_name!r})', + f'msg = serve_utils.get_latest_info_encoded({service_names!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def terminate_service(cls, service_name: str) -> str: - code = [f'serve_utils.terminate_service({service_name!r})'] + def terminate_services(cls, service_names: Optional[List[str]]) -> str: + code = [ + f'msg = serve_utils.terminate_services({service_names!r})', + 'print(msg, end="", flush=True)' + ] return cls._build(code) @classmethod @@ -702,14 +583,6 @@ def stream_serve_process_logs(cls, service_name: str, ] return cls._build(code) - @classmethod - def wait_for_load_balancer_port(cls, service_name: str) -> str: - code = [ - f'msg = serve_utils.wait_for_load_balancer_port({service_name!r})', - 'print(msg, flush=True)' - ] - return cls._build(code) - @classmethod def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code diff --git a/sky/serve/service.py b/sky/serve/service.py index d2a1f5cc52c..124d84dee90 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -118,16 +118,30 @@ def _start(service_name: str, task_yaml: str, job_id: int): if isinstance(config, dict): resources_config = config.get('resources') requested_resources = resources.Resources.from_yaml_config(resources_config) + status = serve_state.ServiceStatus.CONTROLLER_INIT + if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: + status = serve_state.ServiceStatus.PENDING + # TODO(tian): Use id as identifier instead of name. 
serve_state.add_service(service_name, controller_job_id=job_id, policy=service_spec.policy_str(), auto_restart=service_spec.auto_restart, - requested_resources=requested_resources) + requested_resources=requested_resources, + status=status) controller_process = None load_balancer_process = None try: - _handle_signal(service_name) + # Wait until there is a service slot available. + while True: + _handle_signal(service_name) + # Use <= here since we already add this service to database. + if (len(serve_state.get_services()) <= + serve_utils.NUM_SERVICE_THRESHOLD): + serve_state.set_service_status( + service_name, serve_state.ServiceStatus.CONTROLLER_INIT) + break + time.sleep(1) with filelock.FileLock( os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)): diff --git a/sky/spot/__init__.py b/sky/spot/__init__.py index eab619dcee9..7c4cdb989aa 100644 --- a/sky/spot/__init__.py +++ b/sky/spot/__init__.py @@ -11,7 +11,6 @@ from sky.spot.spot_utils import dump_job_table_cache from sky.spot.spot_utils import dump_spot_job_queue from sky.spot.spot_utils import format_job_table -from sky.spot.spot_utils import is_spot_controller_up from sky.spot.spot_utils import load_job_table_cache from sky.spot.spot_utils import load_spot_job_queue from sky.spot.spot_utils import SPOT_CONTROLLER_NAME @@ -33,7 +32,6 @@ 'dump_job_table_cache', 'load_job_table_cache', 'format_job_table', - 'is_spot_controller_up', 'dump_spot_job_queue', 'load_spot_job_queue', ] diff --git a/sky/spot/spot_utils.py b/sky/spot/spot_utils.py index dd400e79573..3b252374a28 100644 --- a/sky/spot/spot_utils.py +++ b/sky/spot/spot_utils.py @@ -17,7 +17,6 @@ from sky import exceptions from sky import global_user_state from sky import sky_logging -from sky import status_lib from sky.backends import backend_utils from sky.skylet import constants from sky.skylet import job_lib @@ -768,68 +767,3 @@ def load_job_table_cache() -> Optional[Tuple[float, str]]: return None with cache_file.open('r') as f: return 
json.load(f) - - -def is_spot_controller_up( - stopped_message: str, - non_existent_message: str = 'No managed spot jobs are found.', -) -> Tuple[Optional[status_lib.ClusterStatus], - Optional['backends.CloudVmRayResourceHandle']]: - """Check if the spot controller is up. - - It can be used to check the actual controller status (since the autostop is - set for the controller) before the spot commands interact with the - controller. - - Args: - stopped_message: Message to print if the controller is STOPPED. - non_existent_message: Message to show if the controller does not exist. - - Returns: - controller_status: The status of the spot controller. If it fails during - refreshing the status, it will be the cached status. None if the - controller does not exist. - handle: The ResourceHandle of the spot controller. None if the - controller is not UP or does not exist. - - Raises: - exceptions.ClusterOwnerIdentityMismatchError: if the current user is not - the same as the user who created the cluster. - exceptions.CloudUserIdentityError: if we fail to get the current user - identity. - """ - try: - # Set force_refresh_statuses=None to make sure the refresh only happens - # when the controller is INIT/UP (triggered in these statuses as the - # autostop is always set for spot controller). This optimization avoids - # unnecessary costly refresh when the controller is already stopped. - # This optimization is based on the assumption that the user will not - # start the controller manually from the cloud console. - controller_status, handle = backend_utils.refresh_cluster_status_handle( - SPOT_CONTROLLER_NAME, force_refresh_statuses=None) - except exceptions.ClusterStatusFetchingError as e: - # We do not catch the exceptions related to the cluster owner identity - # mismatch, please refer to the comment in - # `backend_utils.check_cluster_available`. - logger.warning( - f'Failed to get the status of the spot controller. 
' - 'It is not fatal, but spot commands/calls may hang or return stale ' - 'information, when the controller is not up.\n' - f' Details: {common_utils.format_exception(e, use_bracket=True)}') - record = global_user_state.get_cluster_from_name(SPOT_CONTROLLER_NAME) - controller_status, handle = None, None - if record is not None: - controller_status, handle = record['status'], record['handle'] - - if controller_status is None: - sky_logging.print(non_existent_message) - elif controller_status != status_lib.ClusterStatus.UP: - msg = (f'Spot controller {SPOT_CONTROLLER_NAME} ' - f'is {controller_status.value}.') - if controller_status == status_lib.ClusterStatus.STOPPED: - msg += f'\n{stopped_message}' - if controller_status == status_lib.ClusterStatus.INIT: - msg += '\nPlease wait for the controller to be ready.' - sky_logging.print(msg) - handle = None - return controller_status, handle diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index be940da869d..bf7dba64607 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,12 +1,12 @@ """Utilities for sky status.""" import re -import typing from typing import Any, Callable, Dict, List, Optional import click import colorama from sky import backends +from sky import global_user_state from sky import serve from sky import spot from sky import status_lib @@ -14,9 +14,6 @@ from sky.utils import common_utils from sky.utils import log_utils -if typing.TYPE_CHECKING: - import sky - COMMAND_TRUNC_LENGTH = 25 REPLICA_TRUNC_NUM = 10 NUM_COST_REPORT_LINES = 5 @@ -124,10 +121,7 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): StatusColumn('UPTIME', _get_uptime), StatusColumn('STATUS', _get_service_status_colored), StatusColumn('REPLICAS', _get_replicas), - StatusColumn('CONTROLLER_NAME', - _get_controller_name, - show_by_default=False), - StatusColumn('ENDPOINT', _get_endpoint), + StatusColumn('ENDPOINT', get_endpoint), 
StatusColumn('POLICY', _get_policy, show_by_default=False), StatusColumn('REQUESTED_RESOURCES', _get_requested_resources, @@ -391,7 +385,6 @@ def show_local_status_table(local_clusters: List[str]): 0, cluster_record['duration'], absolute=True)) _get_replica_id = lambda replica_record: replica_record['replica_id'] _get_service_name = lambda replica_record: replica_record['service_name'] -_get_controller_name = lambda replica_record: replica_record['controller_name'] _get_policy = lambda replica_record: replica_record['policy'] _get_requested_resources = lambda replica_record: replica_record[ 'requested_resources'] @@ -416,11 +409,17 @@ def _get_replicas(service_record: _ServiceRecord) -> str: return f'{ready_replica_num}/{total_replica_num}' -def _get_endpoint(service_record: _ServiceRecord) -> str: - endpoint = service_record['endpoint'] - if endpoint is None: +def get_endpoint(service_record: _ServiceRecord) -> str: + # Don't use backend_utils.is_controller_up since it is too slow. + handle = global_user_state.get_handle_from_cluster_name( + serve.SKY_SERVE_CONTROLLER_NAME) + assert isinstance(handle, backends.CloudVmRayResourceHandle) + if handle is None or handle.head_ip is None: + return '-' + load_balancer_port = service_record['load_balancer_port'] + if load_balancer_port is None: return '-' - return endpoint + return f'{handle.head_ip}:{load_balancer_port}' def _get_service_status(service_record: _ServiceRecord) -> serve.ServiceStatus: From 6d590b43f7f032043ffa836a3ac3675d6c78c628 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 21 Oct 2023 13:58:59 -0700 Subject: [PATCH 140/223] minor --- sky/backends/backend_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 63c38f64724..fb1e8e6ccd5 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -145,10 +145,11 @@ class ReservedClusterGroup(enum.Enum): @classmethod def get_group(cls, name: 
Optional[str]) -> Optional['ReservedClusterGroup']: - """Get the reserved group of a cluster with its name + """Get the reserved group of a cluster with its name. - Returns the group name if the cluster name is reserved. Otherwise, - returns None. + Returns: + the group name if the cluster name is reserved. Otherwise, + returns None. """ if name is None: return None From 8961e5948a60023c133dcaf444f995970d5ae641 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sat, 21 Oct 2023 14:34:31 -0700 Subject: [PATCH 141/223] Apply suggestions from code review Co-authored-by: Zongheng Yang --- sky/cli.py | 51 ++++++++++++++++++++++++--------------------------- sky/core.py | 2 +- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 4543f3ed4ce..06fb937dd18 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -113,7 +113,7 @@ '{cluster_num} cluster{plural} {verb}. Please specify an existing ' 'cluster to show its IP address.\nUsage: `sky status --ip `') -_DAG_NOT_SUPPORT_MESSAGE = ('YAML specifies a DAG which is only supported by ' +_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' '`sky spot launch`. `{command}` supports a ' 'single task only.') @@ -2559,7 +2559,7 @@ def start( if reserved and non_reserved: # Keep this behavior the same as _down_or_stop_clusters(). raise click.UsageError( - 'Starting skypilot controllers with other cluster(s) ' + 'Starting controllers with other cluster(s) ' 'is currently not supported.\n' 'Please start the former independently.') if reserved: @@ -4074,7 +4074,7 @@ def serve(): default=None, type=str, help='A service name. Unique for each service. 
If not provided, ' - 'provision a new service with an autogenerated name.') + 'a unique name is autogenerated.') @click.option('--yes', '-y', is_flag=True, @@ -4112,13 +4112,13 @@ def serve_up( if task.service is None: with ux_utils.print_exception_no_traceback(): - raise ValueError('Service section not found in the YAML file.') + raise ValueError('Service section not found in the YAML file. To fix, add a valid `service` field.') assert len(task.resources) == 1 requested_resources = list(task.resources)[0] if requested_resources.ports is not None: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Specifying ports in resources is not allowed. SkyServe will ' + 'Specifying ports in resources is not allowed. Each replica will ' 'use the port specified in the service section.') click.secho('Service Spec:', fg='cyan') @@ -4154,9 +4154,9 @@ def serve_up( @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, endpoint: bool, service_names: List[str]): - """Show statuses of SkyServe service. + """Show statuses of SkyServe services. - Show detailed statuses of the service. If SERVICE_NAME is not provided, + Show detailed statuses of one or more services. If SERVICE_NAME is not provided, show all services' status. If --endpoint is specified, output the endpoint of the service only. @@ -4164,17 +4164,16 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): - ``CONTROLLER_INIT``: The controller is initializing. - - ``REPLICA_INIT``: The controller provisioning have succeeded; controller - and load balancer process is alive, and there are no available replicas + - ``REPLICA_INIT``: The controller provisioning has succeeded; controller + and load balancer processes are alive, and there are no available replicas for now. This also indicates that no replica failure has been detected. 
- - ``CONTROLLER_FAILED``: The controller failed to start or in an abnormal - state; or the controller and load balancer process is not alive. + - ``CONTROLLER_FAILED``: The controller failed to start or is in an abnormal + state; or the controller and load balancer processes are not alive. - - ``READY``: The controller is ready to serve requests. This means that - at least one replica have passed the readiness probe. + - ``READY``: The service is ready to serve requests. At least one replica is in READY state (i.e., has passed the readiness probe). - - ``SHUTTING_DOWN``: The controller is being shutting down. This usually + - ``SHUTTING_DOWN``: The controller is being shut down. This usually happens when the `sky serve down` command is called. - ``FAILED``: At least one replica failed and no replica is ready. This @@ -4192,14 +4191,13 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): - ``PROVISIONING``: The replica is being provisioned. - - ``STARTING``: The replica provisioning have succeeded and the replica is - initializing its service, e.g., installing dependencies or loading model + - ``STARTING``: Replica provisioning has succeeded and the replica is + initializing, e.g., installing dependencies or loading model weights. - - ``READY``: The replica is ready to serve requests. + - ``READY``: The replica is ready to serve requests (i.e., has passed the readiness probe). - - ``NOT_READY``: Currently, this replica failed the readiness probe but not - continuously failed for some time. This usually happens when the replica + - ``NOT_READY``: The replica failed a readiness probe, but has not failed the probe for a continuous period of time (otherwise it'd be marked as XXXX). This usually happens when the replica is suffering from a bad network connection or there are too many requests overwhelming the replica. 
@@ -4211,10 +4209,10 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): This indicates that the replica is already shut down. (Otherwise, it is ``SHUTTING_DOWN``.) - - ``FAILED_CLEANUP``: Some error occurred when the replica is shutting down. - This usually indicates some resources leakage happened since the - termination not finished correctly. When seeing this status, please login - to cloud console and check whether there are some resources not released. + - ``FAILED_CLEANUP``: Some error occurred while the replica was being shut down. + This usually indicates resource leakages since the + termination did not finish correctly. When seeing this status, please login + to the cloud console and check whether there are some leaked VMs/resources. Examples: @@ -4267,7 +4265,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): raise click.UsageError( f'{service_num} service{plural} found. Please specify an' ' existing service to show its endpoint. Usage: ' - '`sky serve status --endpoint `') + 'sky serve status --endpoint ') click.echo(status_utils.get_endpoint(service_records[0])) return click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' @@ -4289,7 +4287,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): '-a', default=False, is_flag=True, - help='Stop all existing clusters.') + help='Tear down all services.') @click.option('--yes', '-y', is_flag=True, @@ -4303,8 +4301,7 @@ def serve_down(service_names: List[str], all: bool, yes: bool): SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence. - Tear down a service will delete all associated resources, including the - controller VM and all replicas. + Tearing down a service will delete all of its replicas and associated resources. 
Example: diff --git a/sky/core.py b/sky/core.py index 6a8766a2658..72315f1da21 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1039,7 +1039,7 @@ def serve_status( sky.cli.serve_status. Args: - service_names: a list of service names to query. If None, query all + service_names: a single or a list of service names to query. If None, query all services. Returns: From 33209197fffeae670b3234b3ae22dd66c18c4a72 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 21 Oct 2023 15:07:19 -0700 Subject: [PATCH 142/223] apply suggestion from code review --- sky/backends/backend_utils.py | 13 +-- sky/backends/cloud_vm_ray_backend.py | 4 +- sky/cli.py | 119 ++++++++++++++------------- sky/core.py | 20 ++--- sky/execution.py | 3 +- sky/serve/__init__.py | 5 +- sky/serve/constants.py | 9 +- sky/serve/controller.py | 2 +- sky/serve/replica_managers.py | 12 +-- sky/serve/serve_state.py | 8 +- sky/serve/serve_utils.py | 2 +- 11 files changed, 102 insertions(+), 95 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index fb1e8e6ccd5..d3a4bdd67eb 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -144,12 +144,13 @@ class ReservedClusterGroup(enum.Enum): 'sky serve controller. ')) @classmethod - def get_group(cls, name: Optional[str]) -> Optional['ReservedClusterGroup']: - """Get the reserved group of a cluster with its name. + def check_cluster_name( + cls, name: Optional[str]) -> Optional['ReservedClusterGroup']: + """Check if the cluster name is reserved. Returns: - the group name if the cluster name is reserved. Otherwise, - returns None. + The group name if the cluster name is reserved. + Otherwise, returns None. 
""" if name is None: return None @@ -2675,7 +2676,7 @@ def get_clusters( if not include_reserved: records = [ record for record in records - if ReservedClusterGroup.get_group(record['name']) is None + if ReservedClusterGroup.check_cluster_name(record['name']) is None ] yellow = colorama.Fore.YELLOW @@ -2965,7 +2966,7 @@ def check_cluster_name_not_reserved( Returns: None, if the cluster name is not reserved. """ - group = ReservedClusterGroup.get_group(cluster_name) + group = ReservedClusterGroup.check_cluster_name(cluster_name) if group is not None: msg = group.value.check_cluster_name_hint if operation_str is not None: diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 480b1bdaec0..0b47d06b05a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3373,7 +3373,7 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - group = backend_utils.ReservedClusterGroup.get_group(name) + group = backend_utils.ReservedClusterGroup.check_cluster_name(name) if group == backend_utils.ReservedClusterGroup.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' @@ -3542,7 +3542,7 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - group = backend_utils.ReservedClusterGroup.get_group(name) + group = backend_utils.ReservedClusterGroup.check_cluster_name(name) if group is not None or down: return stop_str = ('\nTo stop the cluster:' diff --git a/sky/cli.py b/sky/cli.py index 06fb937dd18..ee7359ee1a6 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -114,8 +114,8 @@ 'cluster to show its IP address.\nUsage: `sky status --ip `') _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' - '`sky spot launch`. `{command}` supports a ' - 'single task only.') + '`sky spot launch`. 
`{command}` supports a ' + 'single task only.') def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: @@ -1431,7 +1431,7 @@ def launch( ) if isinstance(task_or_dag, sky.Dag): raise click.UsageError( - _DAG_NOT_SUPPORT_MESSAGE.format(command='sky launch')) + _DAG_NOT_SUPPORTED_MESSAGE.format(command='sky launch')) task = task_or_dag backend: backends.Backend @@ -1815,7 +1815,8 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, reserved_clusters = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + group = backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) if group is not None: reserved_clusters.append(cluster_record) hints.append(group.value.sky_status_hint) @@ -1921,7 +1922,8 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + group = backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) if group is not None: cluster_group_name = group.value.group_name # to display most recent entry for each reserved cluster @@ -2189,7 +2191,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - group = backend_utils.ReservedClusterGroup.get_group(cluster) + group = backend_utils.ReservedClusterGroup.check_cluster_name(cluster) assert group is not None, cluster click.echo(group.value.decline_cancel_hint) sys.exit(1) @@ -2483,8 +2485,8 @@ def start( clusters = [ cluster['name'] for cluster in global_user_state.get_clusters() - if backend_utils.ReservedClusterGroup.get_group(cluster['name']) is - None + if backend_utils.ReservedClusterGroup.check_cluster_name( + cluster['name']) is None ] 
if not clusters: @@ -2552,16 +2554,16 @@ def start( # Checks for reserved clusters (spot controller). reserved, non_reserved = [], [] for name in to_start: - if backend_utils.ReservedClusterGroup.get_group(name) is not None: + if backend_utils.ReservedClusterGroup.check_cluster_name( + name) is not None: reserved.append(name) else: non_reserved.append(name) if reserved and non_reserved: # Keep this behavior the same as _down_or_stop_clusters(). - raise click.UsageError( - 'Starting controllers with other cluster(s) ' - 'is currently not supported.\n' - 'Please start the former independently.') + raise click.UsageError('Starting controllers with other cluster(s) ' + 'is currently not supported.\n' + 'Please start the former independently.') if reserved: bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD @@ -2818,12 +2820,13 @@ def _down_or_stop_clusters( if len(names) > 0: reserved_clusters = [ name for name in names - if backend_utils.ReservedClusterGroup.get_group(name) is not None + if backend_utils.ReservedClusterGroup.check_cluster_name(name) + is not None ] reserved_clusters_str = ', '.join(map(repr, reserved_clusters)) names = [ - name for name in _get_glob_clusters(names) - if backend_utils.ReservedClusterGroup.get_group(name) is None + name for name in _get_glob_clusters(names) if + backend_utils.ReservedClusterGroup.check_cluster_name(name) is None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2857,17 +2860,16 @@ def _down_or_stop_clusters( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is currently not supported.') else: - reserved_group = backend_utils.ReservedClusterGroup.get_group( - reserved_cluster) + reserved_group = (backend_utils.ReservedClusterGroup. 
+ check_cluster_name(reserved_cluster)) assert reserved_group is not None hint_or_raise = _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE[ reserved_group] hint_or_raise(reserved_cluster) confirm_str = 'delete' user_input = click.prompt( - f'To proceed, please check the information above and type ' - f'{colorama.Style.BRIGHT}{confirm_str!r}' - f'{colorama.Style.RESET_ALL}', + f'To proceed, please type {colorama.Style.BRIGHT}' + f'{confirm_str!r}{colorama.Style.RESET_ALL}', type=str) if user_input != confirm_str: raise click.Abort() @@ -2884,8 +2886,10 @@ def _down_or_stop_clusters( # Otherwise, it would be very easy to accidentally delete a reserved # cluster. names = [ - record['name'] for record in all_clusters if - backend_utils.ReservedClusterGroup.get_group(record['name']) is None + record['name'] + for record in all_clusters + if backend_utils.ReservedClusterGroup.check_cluster_name( + record['name']) is None ] clusters = [] @@ -4064,7 +4068,7 @@ def serve(): @serve.command('up', cls=_DocumentedCodeCommand) -@click.argument('entrypoint', +@click.argument('service_yaml', required=True, type=str, nargs=-1, @@ -4083,13 +4087,13 @@ def serve(): help='Skip confirmation prompt.') # TODO(tian): Support the task_option overrides for the service. def serve_up( - entrypoint: List[str], + service_yaml: List[str], service_name: Optional[str], yes: bool, ): """Launch a SkyServe service. - ENTRYPOINT must point to a valid YAML file. + SERVICE_YAML must point to a valid YAML file. Example: @@ -4100,26 +4104,27 @@ def serve_up( if service_name is None: service_name = backend_utils.generate_service_name() - is_yaml, _ = _check_yaml(''.join(entrypoint)) + is_yaml, _ = _check_yaml(''.join(service_yaml)) if not is_yaml: - raise click.UsageError( - 'For `sky serve up`, the entrypoint must be a YAML file.') + raise click.UsageError('SERVICE_YAML must be a valid YAML file.') + # We keep nargs=-1 in service_yaml argument to reuse this function. 
task = _make_task_or_dag_from_entrypoint_with_overrides( - entrypoint, entrypoint_name='Service') + service_yaml, entrypoint_name='Service') if isinstance(task, sky.Dag): raise click.UsageError( - _DAG_NOT_SUPPORT_MESSAGE.format(command='sky serve up')) + _DAG_NOT_SUPPORTED_MESSAGE.format(command='sky serve up')) if task.service is None: with ux_utils.print_exception_no_traceback(): - raise ValueError('Service section not found in the YAML file. To fix, add a valid `service` field.') + raise ValueError('Service section not found in the YAML file. ' + 'To fix, add a valid `service` field.') assert len(task.resources) == 1 requested_resources = list(task.resources)[0] if requested_resources.ports is not None: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Specifying ports in resources is not allowed. Each replica will ' - 'use the port specified in the service section.') + 'Specifying ports in resources is not allowed. Each replica ' + 'will use the port specified in the service section.') click.secho('Service Spec:', fg='cyan') click.echo(task.service) @@ -4156,25 +4161,26 @@ def serve_up( def serve_status(all: bool, endpoint: bool, service_names: List[str]): """Show statuses of SkyServe services. - Show detailed statuses of one or more services. If SERVICE_NAME is not provided, - show all services' status. If --endpoint is specified, output the endpoint - of the service only. + Show detailed statuses of one or more services. If SERVICE_NAME is not + provided, show all services' status. If --endpoint is specified, output + the endpoint of the service only. Each service can have one of the following statuses: - ``CONTROLLER_INIT``: The controller is initializing. - - ``REPLICA_INIT``: The controller provisioning has succeeded; controller - and load balancer processes are alive, and there are no available replicas - for now. This also indicates that no replica failure has been detected. 
+ - ``REPLICA_INIT``: The controller has finished initializing, and there are + no available replicas for now. This also indicates that no replica failure + has been detected. - ``CONTROLLER_FAILED``: The controller failed to start or is in an abnormal state; or the controller and load balancer processes are not alive. - - ``READY``: The service is ready to serve requests. At least one replica is in READY state (i.e., has passed the readiness probe). + - ``READY``: The service is ready to serve requests. At least one replica is + in READY state (i.e., has passed the readiness probe). - - ``SHUTTING_DOWN``: The controller is being shut down. This usually - happens when the `sky serve down` command is called. + - ``SHUTTING_DOWN``: The service is being shut down. This usually happens + when the `sky serve down` command is called. - ``FAILED``: At least one replica failed and no replica is ready. This could be caused by several reasons: @@ -4195,24 +4201,26 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): initializing, e.g., installing dependencies or loading model weights. - - ``READY``: The replica is ready to serve requests (i.e., has passed the readiness probe). + - ``READY``: The replica is ready to serve requests (i.e., has passed the + readiness probe). - - ``NOT_READY``: The replica failed a readiness probe, but has not failed the probe for a continuous period of time (otherwise it'd be marked as XXXX). This usually happens when the replica - is suffering from a bad network connection or there are too many requests - overwhelming the replica. + - ``NOT_READY``: The replica failed a readiness probe, but has not failed + the probe for a continuous period of time (otherwise it'd be shut down). + This usually happens when the replica is suffering from a bad network + connection or there are too many requests overwhelming the replica. - - ``SHUTTING_DOWN``: The replica is being shutting down. 
This usually - happens when the replica is being scaled down or some error occurred. - SkyServe will terminate all replicas that have some error occurred. + - ``SHUTTING_DOWN``: The replica is being shut down. This usually happens + when the replica is being scaled down or some error occurred. SkyServe + will terminate all replicas that errored. - ``FAILED``: Some error occurred when the replica is serving requests. This indicates that the replica is already shut down. (Otherwise, it is ``SHUTTING_DOWN``.) - - ``FAILED_CLEANUP``: Some error occurred while the replica was being shut down. - This usually indicates resource leakages since the - termination did not finish correctly. When seeing this status, please login - to the cloud console and check whether there are some leaked VMs/resources. + - ``FAILED_CLEANUP``: Some error occurred while the replica was being shut + down. This usually indicates resource leakages since the termination + did not finish correctly. When seeing this status, please login to the + cloud console and check whether there are some leaked VMs/resources. Examples: @@ -4301,7 +4309,8 @@ def serve_down(service_names: List[str], all: bool, yes: bool): SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence. - Tearing down a service will delete all of its replicas and associated resources. + Tearing down a service will delete all of its replicas and associated + resources. 
Example: diff --git a/sky/core.py b/sky/core.py index 72315f1da21..c4a79d5f021 100644 --- a/sky/core.py +++ b/sky/core.py @@ -183,7 +183,8 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: + if backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' 'supported for skypilot controllers. Pass ' @@ -301,7 +302,8 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. """ - if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: + if backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) is not None: raise exceptions.NotSupportedError( f'Stopping sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -424,7 +426,8 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if backend_utils.ReservedClusterGroup.get_group(cluster_name) is not None: + if backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) is not None: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -1008,11 +1011,7 @@ def serve_status( { 'name': (str) service name, - 'launched_at': (int) timestamp of creation, - 'controller_name': (str) name of the controller cluster of the - service, - 'endpoint': (str) service endpoint, - 'replica_info': (List[Dict[str, Any]]) replica information, + 'controller_job_id': (int) the job id of the controller, 'uptime': (int) uptime in seconds, 'status': (sky.ServiceStatus) service status, 'controller_port': (Optional[int]) controller port, @@ -1022,6 +1021,7 @@ def serve_status( auto-restarted, 'requested_resources': (sky.Resources) requested 
resources for replica, + 'replica_info': (List[Dict[str, Any]]) replica information, } Each entry in replica_info has the following fields: @@ -1039,8 +1039,8 @@ def serve_status( sky.cli.serve_status. Args: - service_names: a single or a list of service names to query. If None, query all - services. + service_names: a single or a list of service names to query. If None, + query all services. Returns: A list of dicts, with each dict containing the information of a service. diff --git a/sky/execution.py b/sky/execution.py index d099eeaf69d..df583599a2c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -364,7 +364,8 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - group = backend_utils.ReservedClusterGroup.get_group(cluster_name) + group = backend_utils.ReservedClusterGroup.check_cluster_name( + cluster_name) if group is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). 
# diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index fd2dc70f666..e140437926d 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,15 +1,14 @@ """Modules for SkyServe services.""" import os -from sky.serve.constants import CONTROLLER_FILE_LOCK_PATH from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import ENDPOINT_PROBE_INTERVAL from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL from sky.serve.constants import LOAD_BALANCER_PORT_RANGE -from sky.serve.constants import SERVE_PREFIX from sky.serve.constants import SERVICES_TASK_CPU_DEMAND +from sky.serve.constants import SKYSERVE_METADATA_DIR from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus from sky.serve.serve_utils import generate_remote_controller_log_file_name @@ -21,4 +20,4 @@ from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME from sky.serve.service_spec import SkyServiceSpec -os.makedirs(os.path.expanduser(SERVE_PREFIX), exist_ok=True) +os.makedirs(os.path.expanduser(SKYSERVE_METADATA_DIR), exist_ok=True) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 69497a953f0..ab7b1f793d7 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -4,17 +4,12 @@ CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' -SERVE_PREFIX = '~/.sky/serve' - -# The filelock for selecting controller and service ports when starting a -# service. In our current multi-service controller implementation, we need to -# select a controller if there are some existing controllers. -CONTROLLER_FILE_LOCK_PATH = f'{SERVE_PREFIX}/controller.lock' +SKYSERVE_METADATA_DIR = '~/.sky/serve' # The filelock for selecting service ports when starting a service. We need to # have a filelock to avoid port collision when starting multiple services at # the same time. 
-PORT_SELECTION_FILE_LOCK_PATH = f'{SERVE_PREFIX}/port_selection.lock' +PORT_SELECTION_FILE_LOCK_PATH = f'{SKYSERVE_METADATA_DIR}/port_selection.lock' # Signal file path for controller to handle signals. SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 8b12c841e43..ec6bd2d2c30 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -89,7 +89,7 @@ def load_balancer_sync(request: fastapi.Request): request_information = pickle.loads( base64.b64decode(request_information_payload)) logger.info( - f'Received request information: {request_information!r}') + f'Received inflight request information: {request_information}') if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): if not isinstance(request_information, serve_utils.RequestTimestamp): diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index ff243f85c74..fa08cd31f5c 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -316,13 +316,13 @@ def probe( # TODO(tian): Support HTTPS in the future. readiness_path = f'http://{replica_ip}{readiness_suffix}' if post_data is not None: - msg += 'Post' + msg += 'POST' response = requests.post( readiness_path, json=post_data, timeout=serve_constants.READINESS_PROBE_TIMEOUT) else: - msg += 'Get' + msg += 'GET' response = requests.get( readiness_path, timeout=serve_constants.READINESS_PROBE_TIMEOUT) @@ -334,7 +334,7 @@ def probe( msg += f' and response {response.text}.' 
logger.info(msg) if response.status_code == 200: - logger.info(f'Replica {replica_ip} is ready.') + logger.debug(f'Replica {replica_ip} is ready.') return self, True, probe_time except requests.exceptions.RequestException as e: logger.info(e) @@ -572,7 +572,7 @@ def _refresh_process_pool(self) -> None: def _process_pool_refresher(self) -> None: """Periodically refresh the launch/down process pool.""" while True: - logger.info('Refreshing process pool.') + logger.debug('Refreshing process pool.') try: self._refresh_process_pool() except Exception as e: # pylint: disable=broad-except @@ -617,7 +617,7 @@ def _fetch_job_status(self) -> None: def _job_status_fetcher(self) -> None: """Periodically fetch the service job status of all replicas.""" while True: - logger.info('Refreshing job status.') + logger.debug('Refreshing job status.') try: self._fetch_job_status() except Exception as e: # pylint: disable=broad-except @@ -713,7 +713,7 @@ def _probe_all_replicas(self) -> None: def _replica_prober(self) -> None: """Periodically probe replicas.""" while True: - logger.info('Running replica prober.') + logger.debug('Running replica prober.') try: self._probe_all_replicas() replica_statuses = [ diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 3de99a333e9..7801c33a9f4 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -16,7 +16,7 @@ import sky from sky.serve import replica_managers -_DB_PATH = pathlib.Path(constants.SERVE_PREFIX) / 'services.db' +_DB_PATH = pathlib.Path(constants.SKYSERVE_METADATA_DIR) / 'services.db' _DB_PATH = _DB_PATH.expanduser().absolute() _DB_PATH.parents[0].mkdir(parents=True, exist_ok=True) _DB_PATH = str(_DB_PATH) @@ -73,8 +73,10 @@ class ReplicaStatus(enum.Enum): # The replica VM is once failed and has been deleted. FAILED = 'FAILED' - # `sky.down` failed during service teardown. This could mean resource - # leakage. + # `sky.down` failed during service teardown. + # This could mean resource leakage. 
+ # TODO(tian): This status should be removed in the future, at which point + # we should guarantee no resource leakage like regular sky. FAILED_CLEANUP = 'FAILED_CLEANUP' # Unknown status. This should never happen. diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 58f4f6de40e..1fa3d54cf8a 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -181,7 +181,7 @@ def run(self, *args, **kwargs): def generate_remote_service_dir_name(service_name: str) -> str: service_name = service_name.replace('-', '_') - return os.path.join(constants.SERVE_PREFIX, service_name) + return os.path.join(constants.SKYSERVE_METADATA_DIR, service_name) def generate_remote_task_yaml_file_name(service_name: str) -> str: From 05a9f831f14cfa94594775597c89a52eb3a786e3 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 15:18:22 -0700 Subject: [PATCH 143/223] make sky status show services as well --- sky/cli.py | 191 +++++++++++++++++----------- sky/utils/cli_utils/status_utils.py | 34 +++-- 2 files changed, 140 insertions(+), 85 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ee7359ee1a6..017242403c2 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1665,6 +1665,52 @@ def _get_spot_jobs( return num_in_progress_jobs, msg +def _get_services(service_names: Optional[List[str]], + show_all: bool, + show_endpoint: bool, + is_called_by_user: bool = False) -> str: + msg = None + try: + if not is_called_by_user: + usage_lib.messages.usage.set_internal() + with sky_logging.silent(): + if not service_names: + # Change empty list to None + service_names = None + service_records = core.serve_status(service_names) + except exceptions.ClusterNotUpError as e: + controller_status = e.cluster_status + if controller_status == status_lib.ClusterStatus.INIT: + msg = 'Controller is initializing. Please wait for a while.' + else: + assert controller_status in [None, status_lib.ClusterStatus.STOPPED] + msg = 'No existing services.'
+ if controller_status is None: + msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h' + f'{colorama.Style.RESET_ALL})') + except RuntimeError as e: + msg = ('Failed to fetch service statuses due to connection issues. ' + 'Please try again later. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + except Exception as e: # pylint: disable=broad-except + msg = ('Failed to fetch service statuses: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + else: + if show_endpoint: + if len(service_records) != 1: + plural = 's' if len(service_records) > 1 else '' + service_num = (str(len(service_records)) + if len(service_records) > 0 else 'No') + raise click.UsageError( + f'{service_num} service{plural} found. Please specify ' + 'an existing service to show its endpoint. Usage: ' + 'sky serve status --endpoint ') + msg = status_utils.get_endpoint(service_records[0]) + else: + msg = status_utils.format_service_table(service_records, show_all) + return msg + + @cli.command() @click.option('--all', '-a', @@ -1693,6 +1739,11 @@ def _get_spot_jobs( is_flag=True, required=False, help='Also show recent in-progress spot jobs, if any.') +@click.option('--show-services/--no-show-services', + default=True, + is_flag=True, + required=False, + help='Also show sky serve services, if any.') @click.argument('clusters', required=False, type=str, @@ -1701,7 +1752,7 @@ def _get_spot_jobs( @usage_lib.entrypoint # pylint: disable=redefined-builtin def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, - clusters: List[str]): + show_services: bool, clusters: List[str]): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Show clusters. @@ -1751,10 +1802,10 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, or for autostop-enabled clusters, use ``--refresh`` to query the latest cluster statuses from the cloud providers. 
""" - # Using a pool with 1 worker to run the spot job query in parallel to speed - # up. The pool provides a AsyncResult object that can be used as a future. - # TODO(tian): Show service as well. - with multiprocessing.Pool(1) as pool: + # Using a pool with 2 worker to run the spot job query and sky serve service + # query in parallel to speed up. The pool provides a AsyncResult object that + # can be used as a future. + with multiprocessing.Pool(2) as pool: # Do not show spot queue if user specifies clusters, and if user # specifies --ip. show_spot_jobs = show_spot_jobs and not clusters and not ip @@ -1767,6 +1818,16 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, show_all=False, limit_num_jobs_to_show=not all, is_called_by_user=False)) + show_services = show_services and not clusters and not ip + if show_services: + # Run the sky serve service query in parallel to speed up the + # status query. + services_future = pool.apply_async(_get_services, + kwds=dict( + service_names=None, + show_all=False, + show_endpoint=False, + is_called_by_user=False)) if ip: if len(clusters) != 1: with ux_utils.print_exception_no_traceback(): @@ -1830,31 +1891,42 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, nonreserved_cluster_records + reserved_clusters, all) status_utils.show_local_status_table(local_clusters) + def _try_get_future_result(future) -> Tuple[bool, Any]: + result = None + success = True + try: + result = future.get() + except KeyboardInterrupt: + pool.terminate() + # Set to -1, so that the controller is not considered + # down, and the hint for showing sky spot queue + # will still be shown. + success = False + + try: + pool.close() + pool.join() + except SystemExit as e: + # This is to avoid a "Exception ignored" problem caused by + # ray worker setting the sigterm handler to sys.exit(15) + # (see ray/_private/worker.py). + # TODO (zhwu): Remove any importing of ray in SkyPilot. 
+ if e.code != 15: + raise + return success, result + if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Managed spot jobs{colorama.Style.RESET_ALL}') with rich_utils.safe_status('[cyan]Checking spot jobs[/]'): - try: - num_in_progress_jobs, msg = spot_jobs_future.get() - except KeyboardInterrupt: - pool.terminate() - # Set to -1, so that the controller is not considered - # down, and the hint for showing sky spot queue - # will still be shown. + spot_jobs_success, result = _try_get_future_result( + spot_jobs_future) + if spot_jobs_success: + num_in_progress_jobs, msg = result + else: num_in_progress_jobs = -1 msg = 'KeyboardInterrupt' - try: - pool.close() - pool.join() - except SystemExit as e: - # This is to avoid a "Exception ignored" problem caused by - # ray worker setting the sigterm handler to sys.exit(15) - # (see ray/_private/worker.py). - # TODO (zhwu): Remove any importing of ray in SkyPilot. - if e.code != 15: - raise - click.echo(msg) if num_in_progress_jobs is not None: # spot controller is UP. @@ -1875,6 +1947,20 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, f'* {job_info}To see all spot jobs: {colorama.Style.BRIGHT}' f'sky spot queue{colorama.Style.RESET_ALL}') + if show_services: + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') + if not spot_jobs_success: + # The pool is terminated, so we cannot run the service query. + click.secho('Failed to query services. Please try again later.', + fg='yellow') + else: + with rich_utils.safe_status('[cyan]Checking services[/]'): + success, msg = _try_get_future_result(services_future) + if not success: + msg = 'KeyboardInterrupt' + click.echo(msg) + if num_pending_autostop > 0 and not refresh: # Don't print this hint if there's no pending autostop or user has # already passed --refresh. 
@@ -4237,56 +4323,15 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): """ # This won't pollute the output of --endpoint. with rich_utils.safe_status('[cyan]Checking services[/]'): - # TODO(tian): Add this to `sky status` as well. - msg = None - try: - with sky_logging.silent(): - if not service_names: - service_records = core.serve_status(None) - else: - service_records = core.serve_status(service_names) - except exceptions.ClusterNotUpError as e: - controller_status = e.cluster_status - if controller_status == status_lib.ClusterStatus.INIT: - msg = 'Controller is initializing. Please wait for a while.' - else: - assert controller_status in [ - None, status_lib.ClusterStatus.STOPPED - ] - msg = 'No existing services.' - except RuntimeError as e: - msg = ('Failed to fetch service statuses due to connection issues. ' - 'Please try again later. Details: ' - f'{common_utils.format_exception(e, use_bracket=True)}') - except Exception as e: # pylint: disable=broad-except - msg = ('Failed to fetch service statuses: ' - f'{common_utils.format_exception(e, use_bracket=True)}') - if msg is not None: - click.echo(msg) - return - - if endpoint: - if len(service_records) != 1: - plural = 's' if len(service_names) > 1 else '' - service_num = (str(len(service_names)) - if len(service_names) > 0 else 'No') - raise click.UsageError( - f'{service_num} service{plural} found. Please specify an' - ' existing service to show its endpoint. 
Usage: ' - 'sky serve status --endpoint ') - click.echo(status_utils.get_endpoint(service_records[0])) - return - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Services' - f'{colorama.Style.RESET_ALL}') - status_utils.show_service_table(service_records, all) - click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Replicas{colorama.Style.RESET_ALL}') - replica_infos = [] - for service_record in service_records: - for replica_record in service_record['replica_info']: - replica_record['service_name'] = service_record['name'] - replica_infos.append(replica_record) - status_utils.show_replica_table(replica_infos, all) + msg = _get_services(service_names, + show_all=all, + show_endpoint=endpoint, + is_called_by_user=True) + + if not endpoint: + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') + click.echo(msg) @serve.command('down', cls=_DocumentedCodeCommand) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index bf7dba64607..620b54549f9 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -114,10 +114,13 @@ def show_status_table(cluster_records: List[_ClusterRecord], return num_pending_autostop -def show_service_table(service_records: List[_ServiceRecord], show_all: bool): +def format_service_table(service_records: List[_ServiceRecord], + show_all: bool) -> str: + if not service_records: + return 'No existing services.' 
+ status_columns = [ StatusColumn('NAME', _get_name), - StatusColumn('LAUNCHED', _get_launched, show_by_default=False), StatusColumn('UPTIME', _get_uptime), StatusColumn('STATUS', _get_service_status_colored), StatusColumn('REPLICAS', _get_replicas), @@ -133,19 +136,29 @@ def show_service_table(service_records: List[_ServiceRecord], show_all: bool): if status_column.show_by_default or show_all: columns.append(status_column.name) service_table = log_utils.create_table(columns) + replica_infos = [] for record in service_records: row = [] for status_column in status_columns: if status_column.show_by_default or show_all: row.append(status_column.calc(record)) service_table.add_row(row) - if service_records: - click.echo(service_table) - else: - click.echo('No existing services.') + for replica in record['replica_info']: + replica['service_name'] = record['name'] + replica_infos.append(replica) + + replica_table = format_replica_table(replica_infos, show_all) + return (f'{service_table}\n' + f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Replicas{colorama.Style.RESET_ALL}\n' + f'{replica_table}') -def show_replica_table(replica_records: List[_ReplicaRecord], show_all: bool): +def format_replica_table(replica_records: List[_ReplicaRecord], + show_all: bool) -> str: + if not replica_records: + return 'No existing replicas.' 
+ status_columns = [ StatusColumn('SERVICE_NAME', _get_service_name), StatusColumn('ID', _get_replica_id), @@ -176,11 +189,8 @@ def show_replica_table(replica_records: List[_ReplicaRecord], show_all: bool): if status_column.show_by_default or show_all: row.append(status_column.calc(record)) replica_table.add_row(row) - if replica_records: - click.echo(replica_table) - else: - click.echo('No existing replicas.') - click.echo(truncate_hint, nl=False) + + return f'{replica_table}\n{truncate_hint}' def get_total_cost_of_displayed_records( From e2d103db236be81f643aa3a842a201c18baad324 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 15:27:04 -0700 Subject: [PATCH 144/223] fix --- sky/cli.py | 1 + sky/execution.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 017242403c2..bad605870e9 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1915,6 +1915,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: raise return success, result + spot_jobs_success = True if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Managed spot jobs{colorama.Style.RESET_ALL}') diff --git a/sky/execution.py b/sky/execution.py index df583599a2c..17ace038f7b 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -380,7 +380,8 @@ def _execute( # Disable the usage collection for this status command. env = dict(os.environ, **{env_options.Options.DISABLE_LOGGING.value: '1'}) - subprocess_utils.run('sky status --no-show-spot-jobs', env=env) + subprocess_utils.run( + 'sky status --no-show-spot-jobs --no-show-services', env=env) print() print('\x1b[?25h', end='') # Show cursor. 
From 0ab0decf2aa28e4c377a22ae7f809f5a09442e9b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 16:18:54 -0700 Subject: [PATCH 145/223] replica manager ux; use sky logger for uvicorn --- sky/serve/controller.py | 15 ++++++------ sky/serve/load_balancer.py | 22 ++++++++++++----- sky/serve/replica_managers.py | 46 +++++++++++++++++++---------------- 3 files changed, 49 insertions(+), 34 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index ec6bd2d2c30..4ac718ba042 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -102,16 +102,17 @@ def load_balancer_sync(request: fastapi.Request): self.replica_manager.get_ready_replica_ips() } + @self.app.on_event('startup') + def configure_logger(): + uvicorn_access_logger = logging.getLogger('uvicorn.access') + for handler in uvicorn_access_logger.handlers: + handler.setFormatter(sky_logging.FORMATTER) + threading.Thread(target=self._run_autoscaler).start() - # Disable all GET logs if SKYPILOT_DEBUG is not set to avoid overflowing - # the controller logs. - if not env_options.Options.SHOW_DEBUG_INFO.get(): - logging.getLogger('uvicorn.access').addFilter( - SuppressSuccessGetAccessLogsFilter()) + logger.info('SkyServe Controller started on ' + f'http://localhost:{self.port}') - logger.info( - f'SkyServe Controller started on http://localhost:{self.port}') uvicorn.run(self.app, host='localhost', port=self.port) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 449030a1bd4..96b7fa30b39 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,5 +1,6 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" import base64 +import logging import pickle import threading import time @@ -26,11 +27,16 @@ class SkyServeLoadBalancer: def __init__(self, controller_url: str, load_balancer_port: int, replica_port: int) -> None: + """Initialize the load balancer. + + Args: + controller_url: The URL of the controller. 
+ load_balancer_port: The port where the load balancer listens to. + replica_port: The port where the replica app listens to. + """ self.app = fastapi.FastAPI() self.controller_url = controller_url - # This is the port where the load balancer listens to. self.load_balancer_port = load_balancer_port - # This is the port where the replica app listens to. self.replica_port = replica_port self.load_balancing_policy: lb_policies.LoadBalancingPolicy = ( lb_policies.RoundRobinPolicy()) @@ -63,7 +69,7 @@ def _sync_with_controller(self): # Clean up after reporting request information to avoid OOM. self.request_information.clear() response.raise_for_status() - ready_replica_ips = response.json()['ready_replica_ips'] + ready_replica_ips = response.json().get('ready_replica_ips') except requests.RequestException as e: print(f'An error occurred: {e}') else: @@ -91,9 +97,13 @@ def run(self): self._redirect_handler, methods=['GET', 'POST', 'PUT', 'DELETE']) - sync_controller_thread = threading.Thread( - target=self._sync_with_controller, daemon=True) - sync_controller_thread.start() + @self.app.on_event('startup') + def configure_logger(): + uvicorn_access_logger = logging.getLogger('uvicorn.access') + for handler in uvicorn_access_logger.handlers: + handler.setFormatter(sky_logging.FORMATTER) + + threading.Thread(target=self._sync_with_controller, daemon=True).start() logger.info('SkyServe Load Balancer started on ' f'http://0.0.0.0:{self.load_balancer_port}') diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index fa08cd31f5c..d75829b0176 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -285,8 +285,8 @@ def ip(self) -> Optional[str]: def status(self) -> serve_state.ReplicaStatus: replica_status = self.status_property.to_replica_status() if replica_status == serve_state.ReplicaStatus.UNKNOWN: - logger.error('Detecting UNKNOWN replica status for cluster ' - f'{self.cluster_name}') + logger.error('Detecting UNKNOWN replica 
status for ' + f'replica {self.replica_id}.') return replica_status def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: @@ -309,12 +309,12 @@ def probe( Returns: Tuple of (self, is_ready, probe_time). """ - replica_ip = self.ip + replica_identity = f'replica {self.replica_id} with ip {self.ip}' probe_time = time.time() try: msg = '' # TODO(tian): Support HTTPS in the future. - readiness_path = f'http://{replica_ip}{readiness_suffix}' + readiness_path = f'http://{self.ip}{readiness_suffix}' if post_data is not None: msg += 'POST' response = requests.post( @@ -326,19 +326,19 @@ def probe( response = requests.get( readiness_path, timeout=serve_constants.READINESS_PROBE_TIMEOUT) - msg += (f' request to {replica_ip} returned status code ' - f'{response.status_code}') + msg += (f' request to {replica_identity} returned status ' + f'code {response.status_code}') if response.status_code == 200: msg += '.' else: msg += f' and response {response.text}.' logger.info(msg) if response.status_code == 200: - logger.debug(f'Replica {replica_ip} is ready.') + logger.debug(f'{replica_identity.capitalize()} is ready.') return self, True, probe_time except requests.exceptions.RequestException as e: logger.info(e) - logger.info(f'Replica {replica_ip} is not ready.') + logger.info(f'{replica_identity.capitalize()} is not ready.') pass return self, False, probe_time @@ -407,7 +407,7 @@ def _launch_replica(self, replica_id: int) -> None: logger.warning(f'Launch process for replica {replica_id} ' 'already exists. 
Skipping.') return - logger.info(f'Launching replica {replica_id}') + logger.info(f'Launching replica {replica_id}...') cluster_name = serve_utils.generate_replica_cluster_name( self.service_name, replica_id) log_file_name = serve_utils.generate_replica_launch_log_file_name( @@ -447,9 +447,9 @@ def _sync_down_logs(): handle = global_user_state.get_handle_from_cluster_name( info.cluster_name) if handle is None: - logger.error(f'Cannot find cluster {info.cluster_name} ' - 'in the cluster table. Skipping syncing ' - 'down logs.') + logger.error(f'Cannot find cluster {info.cluster_name} for ' + f'replica {replica_id} in the cluster table. ' + 'Skipping syncing down logs.') return replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, 'replica_jobs') @@ -468,7 +468,7 @@ def _sync_down_logs(): if sync_down_logs: _sync_down_logs() - logger.info(f'Deleting replica {replica_id}') + logger.info(f'Terminating replica {replica_id}...') info = serve_state.get_replica_info_from_id(self.service_name, replica_id) assert info is not None @@ -529,15 +529,15 @@ def _refresh_process_pool(self) -> None: info) for replica_id, p in list(self.down_process_pool.items()): if not p.is_alive(): - logger.info(f'Down process for replica {replica_id} finished.') + logger.info( + f'Terminate process for replica {replica_id} finished.') del self.down_process_pool[replica_id] info = serve_state.get_replica_info_from_id( self.service_name, replica_id) assert info is not None if p.exitcode != 0: - logger.error( - f'Down process for replica {replica_id} exited ' - f'abnormally with code {p.exitcode}.') + logger.error(f'Down process for replica {replica_id} ' + f'exited abnormally with code {p.exitcode}.') info.status_property.sky_down_status = ( ProcessStatus.FAILED) else: @@ -643,11 +643,15 @@ def _probe_all_replicas(self) -> None: for info in infos: if not info.status_property.should_track_status(): continue - replica_to_probe.append((info.cluster_name, info.ip)) + 
replica_to_probe.append( + f'replica_{info.replica_id}(ip={info.ip})') probe_futures.append( - executor.submit(info.probe, self.readiness_suffix, - self.post_data)) - logger.info(f'Replicas to probe: {replica_to_probe}') + executor.submit( + info.probe, + self.readiness_suffix, + self.post_data, + )) + logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') # Since futures.as_completed will return futures in the order of # completion, we need the info.probe function to return the info From 4ee5676e882be15f3e5d12aab6442ff5128643c8 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 17:02:45 -0700 Subject: [PATCH 146/223] UX, refactoring --- sky/backends/backend_utils.py | 33 ++++++++++++++++++++++ sky/cli.py | 43 ++++++++--------------------- sky/core.py | 6 ++-- sky/execution.py | 7 +++-- sky/serve/__init__.py | 1 - sky/serve/constants.py | 2 -- sky/spot/__init__.py | 2 -- sky/spot/constants.py | 2 -- sky/utils/cli_utils/status_utils.py | 3 +- 9 files changed, 52 insertions(+), 47 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index d3a4bdd67eb..61baba1e08c 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -103,6 +103,10 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 +# The default idle timeout for skypilot controllers. This includes the spot +# controller and the sky serve controller.
+CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 + @dataclasses.dataclass class ReservedClusterRecord: @@ -111,6 +115,8 @@ class ReservedClusterRecord: check: Callable[[str], bool] sky_status_hint: str decline_cancel_hint: str + decline_down_in_init_status_hint: str + decline_down_for_dirty_controller_hint: str check_cluster_name_hint: str @@ -128,6 +134,18 @@ class ReservedClusterGroup(enum.Enum): 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the spot controller while ' + 'it is in INIT state is not supported (this means a spot launch ' + 'is in progress or the previous launch failed), as we cannot ' + 'guarantee that all the spot jobs are finished. Please wait ' + 'until the spot controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' + f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' + f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), check_cluster_name_hint=( f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' 'managed spot controller. ')) @@ -139,6 +157,21 @@ class ReservedClusterGroup(enum.Enum): f'sky serve status{colorama.Style.RESET_ALL}'), decline_cancel_hint=( 'Cancelling the sky serve controller\'s jobs is not allowed.'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + 'while it is in INIT state is not supported (this means a sky ' + 'serve up is in progress or the previous launch failed), as we ' + 'cannot guarantee that all the services are terminated. 
Please ' + 'wait until the sky serve controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' + f'{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' + 'supported, as it is currently serving the following services: ' + '{service_names}. Please terminate the services first with ' + f'{colorama.Style.BRIGHT}sky serve down -a' + f'{colorama.Style.RESET_ALL}.'), check_cluster_name_hint=( f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' 'sky serve controller. ')) diff --git a/sky/cli.py b/sky/cli.py index bad605870e9..4d4499ce37a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,7 +51,6 @@ from sky import core from sky import exceptions from sky import global_user_state -from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib from sky import status_lib @@ -2760,17 +2759,13 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return + group = backend_utils.ReservedClusterGroup.check_cluster_name( + controller_name) + assert group is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - f'{colorama.Fore.RED}Tearing down the spot controller while ' - 'it is in INIT state is not supported (this means a spot ' - 'launch is in progress or the previous launch failed), as we ' - 'cannot ' - 'guarantee that all the spot jobs are finished. Please wait ' - 'until the spot controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.') + group.value.decline_down_in_init_status_hint) msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' f'spot controller ({cluster_status.value}). 
Please be ' f'aware of the following:{colorama.Style.RESET_ALL}' @@ -2799,10 +2794,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): non_terminal_jobs): job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) - msg = (f'{colorama.Fore.RED}In-progress spot jobs found. ' - 'To avoid resource leakage, cancel all jobs first: ' - f'{colorama.Style.BRIGHT}sky spot cancel -a' - f'{colorama.Style.RESET_ALL}\n') + msg = group.value.decline_down_for_dirty_controller_hint # Add prefix to each line to align with the bullet point. msg += '\n'.join( [' ' + line for line in job_table.split('\n') if line != '']) @@ -2820,18 +2812,13 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): click.echo('Sky serve controller has already been torn down.') return + group = backend_utils.ReservedClusterGroup.check_cluster_name( + controller_name) + assert group is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: - # TODO(tian): Refactor to reserved group record. with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - f'{colorama.Fore.RED}Tearing down the sky serve controller ' - 'while it is in INIT state is not supported (this means a sky ' - 'serve up is in progress or the previous launch failed), as we ' - 'cannot guarantee that all the services are terminated. 
Please ' - 'wait until the sky serve controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' - f'{colorama.Style.RESET_ALL}.') + group.value.decline_down_in_init_status_hint) elif cluster_status == status_lib.ClusterStatus.UP: with rich_utils.safe_status( '[bold cyan]Checking for running services[/]'): @@ -2844,14 +2831,9 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): if services: service_names = [service['name'] for service in services] with ux_utils.print_exception_no_traceback(): - plural = '' if len(service_names) == 1 else 's' - raise exceptions.NotSupportedError( - f'{colorama.Fore.RED}Tearing down the sky serve controller ' - f'is not supported, as it is currently serving the ' - f'following service{plural}: {", ".join(service_names)}. ' - f'Please terminate the service{plural} first with ' - f'{colorama.Style.BRIGHT}sky serve down ' - f'{" ".join(service_names)}{colorama.Style.RESET_ALL}.') + msg = group.value.decline_down_for_dirty_controller_hint.format( + service_names=', '.join(service_names)) + raise exceptions.NotSupportedError(msg) # Do nothing for STOPPED state, as it is safe to terminate the cluster. click.echo(f'Terminate sky serve controller: {controller_name}.') @@ -4462,7 +4444,6 @@ def serve_logs( # Tail the controller logs of a service: sky serve logs --controller --target load-balancer [SERVICE_ID] """ - # TODO(tian): nit: use sum([...]) have_replica_id = replica_id is not None num_flags = (controller + load_balancer + have_replica_id) if num_flags > 1: diff --git a/sky/core.py b/sky/core.py index c4a79d5f021..6a84bcdb0f6 100644 --- a/sky/core.py +++ b/sky/core.py @@ -195,10 +195,8 @@ def _start( 'supported when starting skypilot controllers. To ' 'fix: omit the `idle_minutes_to_autostop` argument to use the ' f'default autostop settings (got: {idle_minutes_to_autostop}).') - # TODO(tian): Maybe we should merge the two MINUTES_TO_AUTOSTOP - # together. 
Currently, the two value is the same so we just use spot - # constant here. - idle_minutes_to_autostop = spot.SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP + idle_minutes_to_autostop = ( + backend_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) # NOTE: if spot_queue() calls _start() and hits here, that entrypoint # would have a cluster name (the controller) filled in. diff --git a/sky/execution.py b/sky/execution.py index 17ace038f7b..881fc49a181 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -779,8 +779,8 @@ def spot_launch( stream_logs=stream_logs, cluster_name=controller_name, detach_run=detach_run, - idle_minutes_to_autostop=spot. - SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, + idle_minutes_to_autostop=backend_utils. + CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) @@ -1074,6 +1074,7 @@ def serve_up( # We use autostop here to reduce cold start time, since in most # cases the controller resources requirement will be the default # value and a previous controller could be reused. - idle_minutes_to_autostop=serve.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, + idle_minutes_to_autostop=backend_utils. 
+ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index e140437926d..02011c9a40f 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,7 +1,6 @@ """Modules for SkyServe services.""" import os -from sky.serve.constants import CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_TEMPLATE from sky.serve.constants import ENDPOINT_PROBE_INTERVAL diff --git a/sky/serve/constants.py b/sky/serve/constants.py index ab7b1f793d7..3e1d25be52b 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -1,7 +1,5 @@ """Constants used for SkyServe.""" -CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 - CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2' SKYSERVE_METADATA_DIR = '~/.sky/serve' diff --git a/sky/spot/__init__.py b/sky/spot/__init__.py index 7c4cdb989aa..f14f9ae25ad 100644 --- a/sky/spot/__init__.py +++ b/sky/spot/__init__.py @@ -2,7 +2,6 @@ import pathlib from sky.spot.constants import SPOT_CLUSTER_NAME_PREFIX_LENGTH -from sky.spot.constants import SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP from sky.spot.constants import SPOT_CONTROLLER_TEMPLATE from sky.spot.constants import SPOT_CONTROLLER_YAML_PREFIX from sky.spot.constants import SPOT_TASK_YAML_PREFIX @@ -23,7 +22,6 @@ 'SPOT_DEFAULT_STRATEGY', 'SPOT_CONTROLLER_NAME', # Constants - 'SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP', 'SPOT_CONTROLLER_TEMPLATE', 'SPOT_CONTROLLER_YAML_PREFIX', 'SPOT_TASK_YAML_PREFIX', diff --git a/sky/spot/constants.py b/sky/spot/constants.py index 2e01d64e939..841a8fafaaa 100644 --- a/sky/spot/constants.py +++ b/sky/spot/constants.py @@ -1,7 +1,5 @@ """Constants used for Managed Spot.""" -SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 - SPOT_CONTROLLER_TEMPLATE = 'spot-controller.yaml.j2' SPOT_CONTROLLER_YAML_PREFIX = '~/.sky/spot_controller' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py 
index 620b54549f9..0db43300217 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -8,7 +8,6 @@ from sky import backends from sky import global_user_state from sky import serve -from sky import spot from sky import status_lib from sky.backends import backend_utils from sky.utils import common_utils @@ -273,7 +272,7 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], if cluster_records: if reserved_group_name is not None: - autostop_minutes = spot.SPOT_CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP + autostop_minutes = backend_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{reserved_group_name}{colorama.Style.RESET_ALL}' f'{colorama.Style.DIM} (will be autostopped if idle for ' From ce82675206a7797def978be6bee8e6d064bfbab9 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 23:05:19 -0700 Subject: [PATCH 147/223] rephrase hint after sky serve up --- sky/backends/cloud_vm_ray_backend.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 0b47d06b05a..78d7c99f0a5 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3402,7 +3402,11 @@ def _exec_code_on_head( '\nTo see detailed info:\t\t' f'{backend_utils.BOLD}sky serve status {sn} (-a)' f'{backend_utils.RESET_BOLD}' - '\nTo see logs of one replica:\t' + '\nTo teardown the service:\t' + f'{backend_utils.BOLD}sky serve down {sn}' + f'{backend_utils.RESET_BOLD}' + '\n' + '\nTo see logs of a replica:\t' f'{backend_utils.BOLD}sky serve logs {sn} [REPLICA_ID]' f'{backend_utils.RESET_BOLD}' '\nTo see logs of load balancer:\t' @@ -3411,20 +3415,17 @@ def _exec_code_on_head( '\nTo see logs of controller:\t' f'{backend_utils.BOLD}sky serve logs --controller {sn}' f'{backend_utils.RESET_BOLD}' - '\nTo teardown the service:\t\t' - 
f'{backend_utils.BOLD}sky serve down {sn}' - f'{backend_utils.RESET_BOLD}' + '\n' '\nTo monitor replica status:\t' f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' f'{backend_utils.RESET_BOLD}' '\nTo send a test request:\t\t' f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' f'--endpoint){backend_utils.RESET_BOLD}' - f'\n(use {backend_utils.BOLD}sky serve status {sn}' - f'{backend_utils.RESET_BOLD} to get all valid [REPLICA_ID])' - f'\n{style.BRIGHT}{fore.GREEN}SkyServe is bootstrapping ' - 'your service now. The endpoint and replicas should be ' - f'ready within a short time.{style.RESET_ALL}') + f'\n{fore.GREEN}SkyServe is bootstrapping your service now.' + f'{style.RESET_ALL}' + f'\n{fore.GREEN}The endpoint and replicas should be ready ' + f'within a short time.{style.RESET_ALL}') else: logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' From ce283f18153b4b3813ba74db0933a9d749345598 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 22 Oct 2023 23:11:56 -0700 Subject: [PATCH 148/223] Update sky/execution.py Co-authored-by: Zongheng Yang --- sky/execution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/execution.py b/sky/execution.py index 881fc49a181..29d5319fc5a 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -61,6 +61,8 @@ sky.spot_launch(task, ...) """.strip() + +# ... 
_CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( '{controller_type} controller resources is not valid, please check ' '~/.sky/config.yaml file and make sure ' From 05aec69daf5677d442ac5db80529c961e7d4cd94 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 22 Oct 2023 23:21:27 -0700 Subject: [PATCH 149/223] comments --- sky/core.py | 2 +- sky/execution.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/core.py b/sky/core.py index 6a84bcdb0f6..b7de65768c6 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1130,7 +1130,7 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, all: bool = False) -> None: """Teardown a service. - Please refer to the sky.cli.serve_down for the document. + Please refer to the sky.cli.serve_down for the docs. Args: service_names: Name of the service(s). diff --git a/sky/execution.py b/sky/execution.py index 29d5319fc5a..2ef6bc42f9a 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -62,7 +62,8 @@ sky.spot_launch(task, ...) """.strip() -# ... +# Message thrown when APIs sky.{spot_launch,serve_up}() received an invalid +# controller resources spec. 
_CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( '{controller_type} controller resources is not valid, please check ' '~/.sky/config.yaml file and make sure ' From bd835d45fcd816c346ddac0107db1c09d65f6c3e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 23 Oct 2023 15:55:30 -0700 Subject: [PATCH 150/223] add service name check before sky serve up --- sky/execution.py | 107 +++++++++++++++++++++++++++++---------- sky/serve/__init__.py | 1 + sky/serve/serve_state.py | 37 ++++++++++---- sky/serve/serve_utils.py | 24 ++++++++- sky/serve/service.py | 17 ++++--- 5 files changed, 141 insertions(+), 45 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 2ef6bc42f9a..98c69a646e6 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -12,6 +12,7 @@ import uuid import colorama +import filelock import sky from sky import backends @@ -70,6 +71,8 @@ '{controller_type}.controller.resources is a valid resources spec. ' 'Details:\n {err}') +_SERVE_UP_NAME_LOCK_PATH = '/tmp/sky_serve_up_{}.lock' + def _convert_to_dag(entrypoint: Any) -> 'sky.Dag': """Convert the entrypoint to a sky.Dag. @@ -959,33 +962,39 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, storage_obj.force_delete = True -@usage_lib.entrypoint -def serve_up( - task: 'sky.Task', - service_name: Optional[str] = None, -) -> None: - """Spin up a service. +def _register_service_name(service_name: str) -> bool: + """Register a service name on the controller if it is running. - Please refer to the sky.cli.serve_up for the document. - - Args: - task: sky.Task to serve up. - service_name: Name of the service. + Returns: + True if the service name is registered successfully, False otherwise. """ - if service_name is None: - service_name = backend_utils.generate_service_name() - - # The service name will be used as: - # 1. controller cluster name: 'sky-serve-controller-' - # 2. replica cluster name: '-' - # In both cases, service name shares the same regex with cluster name. 
- if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service name {service_name!r} is invalid: ' - f'ensure it is fully matched by regex (e.g., ' - 'only contains lower letters, numbers and dash): ' - f'{constants.CLUSTER_NAME_VALID_REGEX}') - + with sky_logging.silent(): + _, handle = backend_utils.is_controller_up(is_spot=False, + stopped_message='') + if handle is None or handle.head_ip is None: + # The sky serve controller is STOPPED, or it is the first time + # provisioning either after an AUTOSTOP, or the first time the + # controller is created, which means there is no service on the + # controller. We will create the service database record in + # sky.serve.service._start once the controller is running. + logger.info('The sky serve controller is not running. ' + 'Will register the service once the controller is up.') + return True + # The sky serve controller is UP, check if the service exists. 
+ code = serve.ServeCodeGen.add_service_if_not_exist(service_name) + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + returncode, stdout, _ = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False) + subprocess_utils.handle_returncode( + returncode, code, 'Failed to register service name on controller', + stdout) + return serve.load_add_service_result(stdout) + + +def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: if task.service is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError('Service section not found.') @@ -1023,6 +1032,16 @@ def serve_up( task.set_resources( requested_resources.copy(ports=[task.service.replica_port])) + with rich_utils.safe_status( + '[cyan]Registering service on the controller[/]'): + success = _register_service_name(service_name) + if not success: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'The service {service_name!r} is already running. ' + 'Update service will be supported in the future. For now, ' + '`sky serve down` and then `sky serve up` again.') + controller_name = serve.SKY_SERVE_CONTROLLER_NAME # TODO(tian): Probably run another sky.launch after we get the load balancer # port from the controller? So we don't need to open so many ports here. Or, @@ -1074,10 +1093,42 @@ def serve_up( stream_logs=False, cluster_name=controller_name, detach_run=True, - # We use autostop here to reduce cold start time, since in most - # cases the controller resources requirement will be the default - # value and a previous controller could be reused. idle_minutes_to_autostop=backend_utils. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) + + +@usage_lib.entrypoint +def serve_up( + task: 'sky.Task', + service_name: Optional[str] = None, +) -> None: + """Spin up a service. + + Please refer to the sky.cli.serve_up for the document. + + Args: + task: sky.Task to serve up. 
+ service_name: Name of the service. + """ + if service_name is None: + service_name = backend_utils.generate_service_name() + + # The service name will be used as: + # 1. controller cluster name: 'sky-serve-controller-' + # 2. replica cluster name: '-' + # In both cases, service name shares the same regex with cluster name. + if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service name {service_name!r} is invalid: ' + f'ensure it is fully matched by regex (e.g., ' + 'only contains lower letters, numbers and dash): ' + f'{constants.CLUSTER_NAME_VALID_REGEX}') + + # We need this lock to make sure no two sky.serve_up() with same service + # name are running at the same time. It is for the race condition that + # two of them are trying to create a record in controller services database + # but the controller is not up yet. + with filelock.FileLock(_SERVE_UP_NAME_LOCK_PATH.format(service_name)): + _serve_up_no_lock(task, service_name) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 02011c9a40f..fe91b577bd6 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -13,6 +13,7 @@ from sky.serve.serve_utils import generate_remote_controller_log_file_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name +from sky.serve.serve_utils import load_add_service_result from sky.serve.serve_utils import load_latest_info from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 7801c33a9f4..418862e5908 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -29,14 +29,14 @@ _CURSOR.execute("""\ CREATE TABLE IF NOT EXISTS services ( name TEXT PRIMARY KEY, - controller_job_id INTEGER, + controller_job_id INTEGER DEFAULT NULL, controller_port INTEGER DEFAULT 
NULL, load_balancer_port INTEGER DEFAULT NULL, status TEXT, uptime INTEGER DEFAULT NULL, - policy TEXT, - auto_restart INTEGER, - requested_resources BLOB)""") + policy TEXT DEFAULT NULL, + auto_restart INTEGER DEFAULT NULL, + requested_resources BLOB DEFAULT NULL)""") _CURSOR.execute("""\ CREATE TABLE IF NOT EXISTS replicas ( service_name TEXT, @@ -45,6 +45,8 @@ PRIMARY KEY (service_name, replica_id))""") _CONN.commit() +_UNIQUE_CONSTRAINT_FAILED_ERROR_MSG = 'UNIQUE constraint failed: services.name' + # === Statuses === class ReplicaStatus(enum.Enum): @@ -169,14 +171,30 @@ def from_replica_statuses( # === Service functions === -def add_service(name: str, controller_job_id: int, policy: str, - auto_restart: bool, requested_resources: 'sky.Resources', - status: ServiceStatus) -> None: +def add_service_if_not_exist(name: str) -> bool: """Adds a service to the database.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + try: + cursor.execute( + """\ + INSERT INTO services (name, status) + VALUES (?, ?)""", (name, ServiceStatus.CONTROLLER_INIT.value)) + except sqlite3.IntegrityError as e: + if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG: + raise RuntimeError('Unexpected database error') from e + return False + return True + + +def add_or_update_service(name: str, controller_job_id: int, policy: str, + auto_restart: bool, + requested_resources: 'sky.Resources', + status: ServiceStatus) -> None: + """Updates a service in the database.""" with db_utils.safe_cursor(_DB_PATH) as cursor: cursor.execute( """\ - INSERT INTO services + INSERT OR REPLACE INTO services (name, controller_job_id, status, policy, auto_restart, requested_resources) VALUES (?, ?, ?, ?, ?, ?)""", @@ -243,7 +261,8 @@ def _get_service_from_row(row) -> Dict[str, Any]: 'uptime': uptime, 'policy': policy, 'auto_restart': bool(auto_restart), - 'requested_resources': pickle.loads(requested_resources), + 'requested_resources': pickle.loads(requested_resources) + if requested_resources is not None else 
None, } diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 1fa3d54cf8a..f77c71c2b98 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -251,13 +251,27 @@ def update_service_status() -> None: if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN: # Skip services that is shutting down. continue - controller_status = job_lib.get_status(record['controller_job_id']) + controller_job_id = record['controller_job_id'] + if controller_job_id is None: + # The service just registered and the controller job is not + # scheduled yet. + continue + controller_status = job_lib.get_status(controller_job_id) if controller_status is None or controller_status.is_terminal(): # If controller job is not running, set it as controller failed. serve_state.set_service_status( record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED) +def add_service_if_not_exist(service_name: str) -> str: + return common_utils.encode_payload( + serve_state.add_service_if_not_exist(service_name)) + + +def load_add_service_result(payload: str) -> bool: + return common_utils.decode_payload(payload) + + def get_replica_info(service_name: str, with_handle: bool) -> List[Dict[str, Any]]: """Get the information of all replicas of the service. 
@@ -544,6 +558,14 @@ class ServeCodeGen: 'from sky.serve import serve_utils', ] + @classmethod + def add_service_if_not_exist(cls, service_name: str) -> str: + code = [ + f'msg = serve_utils.add_service_if_not_exist({service_name!r})', + 'print(msg, end="", flush=True)' + ] + return cls._build(code) + @classmethod def get_latest_info(cls, service_names: Optional[List[str]]) -> str: code = [ diff --git a/sky/serve/service.py b/sky/serve/service.py index 124d84dee90..224acdf534a 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -121,13 +121,16 @@ def _start(service_name: str, task_yaml: str, job_id: int): status = serve_state.ServiceStatus.CONTROLLER_INIT if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: status = serve_state.ServiceStatus.PENDING - # TODO(tian): Use id as identifier instead of name. - serve_state.add_service(service_name, - controller_job_id=job_id, - policy=service_spec.policy_str(), - auto_restart=service_spec.auto_restart, - requested_resources=requested_resources, - status=status) + # Here, the service record might already registered in the database if the + # controller is UP, but also might not if the controller is STOPPED or not + # created yet before this service. So we use add_or_update_service here. + # See sky.execution._register_service_name for more details. 
+ serve_state.add_or_update_service(service_name, + controller_job_id=job_id, + policy=service_spec.policy_str(), + auto_restart=service_spec.auto_restart, + requested_resources=requested_resources, + status=status) controller_process = None load_balancer_process = None From e17d789d3bc7834be579bc105129b7e778eaf9c8 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 23 Oct 2023 17:45:16 -0700 Subject: [PATCH 151/223] rename reserved cluster to controller --- sky/backends/backend_utils.py | 146 ++++++-------------- sky/backends/cloud_vm_ray_backend.py | 11 +- sky/cli.py | 95 ++++++------- sky/core.py | 196 ++++++++++++++++----------- sky/execution.py | 10 +- 5 files changed, 218 insertions(+), 240 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 61baba1e08c..c22261d5f19 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -14,8 +14,7 @@ import textwrap import time import typing -from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, - Union) +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union import uuid import colorama @@ -109,24 +108,25 @@ @dataclasses.dataclass -class ReservedClusterRecord: - """Record for reserved cluster group.""" - group_name: str - check: Callable[[str], bool] +class ControllerSpec: + """Spec for skypilot controllers.""" + name: str + cluster_name: str sky_status_hint: str decline_cancel_hint: str decline_down_in_init_status_hint: str decline_down_for_dirty_controller_hint: str check_cluster_name_hint: str + default_hint_if_non_existent: str -class ReservedClusterGroup(enum.Enum): - """Reserved cluster groups for skypilot.""" +class Controllers(enum.Enum): + """Skypilot controllers.""" # NOTE(dev): Keep this align with - # sky/cli.py::_RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE - SPOT_CONTROLLER = ReservedClusterRecord( - group_name='Managed spot controller', - check=lambda name: name == spot_lib.SPOT_CONTROLLER_NAME, + # 
sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE + SPOT_CONTROLLER = ControllerSpec( + name='managed spot controller', + cluster_name=spot_lib.SPOT_CONTROLLER_NAME, sky_status_hint=( f'* To see detailed spot job status: {colorama.Style.BRIGHT}' f'sky spot queue{colorama.Style.RESET_ALL}'), @@ -148,10 +148,11 @@ class ReservedClusterGroup(enum.Enum): f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), check_cluster_name_hint=( f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' - 'managed spot controller. ')) - SKY_SERVE_CONTROLLER = ReservedClusterRecord( - group_name='Sky Serve controller', - check=lambda name: name == serve_lib.SKY_SERVE_CONTROLLER_NAME, + 'managed spot controller. '), + default_hint_if_non_existent='No managed spot jobs are found.') + SKY_SERVE_CONTROLLER = ControllerSpec( + name='sky serve controller', + cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, sky_status_hint=( f'* To see detailed service status: {colorama.Style.BRIGHT}' f'sky serve status{colorama.Style.RESET_ALL}'), @@ -174,22 +175,20 @@ class ReservedClusterGroup(enum.Enum): f'{colorama.Style.RESET_ALL}.'), check_cluster_name_hint=( f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' - 'sky serve controller. ')) + 'sky serve controller. '), + default_hint_if_non_existent='No service is found.') @classmethod - def check_cluster_name( - cls, name: Optional[str]) -> Optional['ReservedClusterGroup']: - """Check if the cluster name is reserved. + def check_cluster_name(cls, name: Optional[str]) -> Optional['Controllers']: + """Check if the cluster name is a controller name. Returns: - The group name if the cluster name is reserved. + The controller if the cluster name is a controller name. Otherwise, returns None. 
""" - if name is None: - return None - for group in cls: - if group.value.check(name): - return group + for controller in cls: + if controller.value.cluster_name == name: + return controller return None @@ -2589,9 +2588,8 @@ def check_cluster_available( return handle -# TODO(tian): Probably use ReservedClusterGroup and add some attr for the msg? def is_controller_up( - is_spot: bool, + controller_type: Controllers, stopped_message: str, non_existent_message: Optional[str] = None, ) -> Tuple[Optional[status_lib.ClusterStatus], @@ -2603,7 +2601,7 @@ def is_controller_up( controller. Args: - is_spot: Whether the type of the controller is spot. + type: Type of the controller. stopped_message: Message to print if the controller is STOPPED. non_existent_message: Message to show if the controller does not exist. @@ -2620,16 +2618,10 @@ def is_controller_up( exceptions.CloudUserIdentityError: if we fail to get the current user identity. """ - if is_spot: - controller_name = spot_lib.SPOT_CONTROLLER_NAME - controller_hint = 'spot' - if non_existent_message is None: - non_existent_message = 'No managed spot jobs are found.' - else: - controller_name = serve_lib.SKY_SERVE_CONTROLLER_NAME - controller_hint = 'sky serve' - if non_existent_message is None: - non_existent_message = 'No service is found.' + if non_existent_message is None: + non_existent_message = controller_type.value.default_hint_if_non_existent + cluster_name = controller_type.value.cluster_name + controller_name = controller_type.value.name.replace(' controller', '') try: # Set force_refresh_statuses=None to make sure the refresh only happens # when the controller is INIT/UP (triggered in these statuses as the @@ -2638,17 +2630,17 @@ def is_controller_up( # This optimization is based on the assumption that the user will not # start the controller manually from the cloud console. 
controller_status, handle = refresh_cluster_status_handle( - controller_name, force_refresh_statuses=None) + cluster_name, force_refresh_statuses=None) except exceptions.ClusterStatusFetchingError as e: # We do not catch the exceptions related to the cluster owner identity # mismatch, please refer to the comment in # `backend_utils.check_cluster_available`. logger.warning( 'Failed to get the status of the controller. It is not ' - f'fatal, but {controller_hint} commands/calls may hang or return ' + f'fatal, but {controller_name} commands/calls may hang or return ' 'stale information, when the controller is not up.\n' f' Details: {common_utils.format_exception(e, use_bracket=True)}') - record = global_user_state.get_cluster_from_name(controller_name) + record = global_user_state.get_cluster_from_name(cluster_name) controller_status, handle = None, None if record is not None: controller_status, handle = record['status'], record['handle'] @@ -2656,7 +2648,7 @@ def is_controller_up( if controller_status is None: sky_logging.print(non_existent_message) elif controller_status != status_lib.ClusterStatus.UP: - msg = (f'{controller_hint.capitalize()} controller {controller_name} ' + msg = (f'{controller_name.capitalize()} controller {cluster_name} ' f'is {controller_status.value}.') if controller_status == status_lib.ClusterStatus.STOPPED: msg += f'\n{stopped_message}' @@ -2709,7 +2701,7 @@ def get_clusters( if not include_reserved: records = [ record for record in records - if ReservedClusterGroup.check_cluster_name(record['name']) is None + if Controllers.check_cluster_name(record['name']) is None ] yellow = colorama.Fore.YELLOW @@ -2819,61 +2811,6 @@ def _refresh_cluster(cluster_name): return kept_records -def refresh_service_status( - service_names: Optional[List[str]]) -> List[Dict[str, Any]]: - """Refresh the status of the services. - - Args: - service_names: If provided, only refresh the status of the specified - services. 
Otherwise, refresh the status of all services. - - Returns: - A list of updated service records. - """ - try: - check_network_connection() - except exceptions.NetworkError: - logger.warning('Failed to refresh service status due to network error.') - return [] - - # TODO(tian): This is so slow... It will take ~10s to refresh the status - # of controller. Can we optimize this? - controller_status, handle = is_controller_up( - is_spot=False, stopped_message='No service is found.') - - if handle is None or handle.head_ip is None: - # When the controller is STOPPED, the head_ip will be None, as - # it will be set in global_user_state.remove_cluster(). - # We do not directly check for UP because the controller may be - # in INIT state during another spot launch, but still have - # head_ip available. In this case, we can still try to ssh - # into the controller and fetch the job table. - raise exceptions.ClusterNotUpError('Sky serve controller is not up.', - cluster_status=controller_status) - - backend = get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - - code = serve_lib.ServeCodeGen.get_latest_info(service_names) - returncode, latest_info_payload, stderr = backend.run_on_head( - handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - - try: - subprocess_utils.handle_returncode(returncode, - code, - 'Failed to fetch services', - stderr, - stream_logs=False) - except exceptions.CommandError as e: - raise RuntimeError(e.error_msg) from e - - return serve_lib.load_latest_info(latest_info_payload) - - # Internal only: def download_and_stream_latest_job_log( backend: 'cloud_vm_ray_backend.CloudVmRayBackend', @@ -2990,7 +2927,10 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: def check_cluster_name_not_reserved( cluster_name: Optional[str], operation_str: Optional[str] = None) -> None: - """Errors out if the cluster name is reserved (e.g., spot/serve controller). 
+ """Errors out if the cluster name is reserved. + + Currently, all reserved cluster names are skypilot controller, i.e. + spot controller/sky serve controller. Raises: sky.exceptions.NotSupportedError: if the cluster name is reserved, raise @@ -2999,9 +2939,9 @@ def check_cluster_name_not_reserved( Returns: None, if the cluster name is not reserved. """ - group = ReservedClusterGroup.check_cluster_name(cluster_name) - if group is not None: - msg = group.value.check_cluster_name_hint + controller = Controllers.check_cluster_name(cluster_name) + if controller is not None: + msg = controller.value.check_cluster_name_hint if operation_str is not None: msg += f' {operation_str} is not allowed.' with ux_utils.print_exception_no_traceback(): diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 78d7c99f0a5..3a342d79571 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3373,8 +3373,8 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - group = backend_utils.ReservedClusterGroup.check_cluster_name(name) - if group == backend_utils.ReservedClusterGroup.SPOT_CONTROLLER: + controller = backend_utils.Controllers.check_cluster_name(name) + if controller == backend_utils.Controllers.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' @@ -3393,8 +3393,7 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif (group == - backend_utils.ReservedClusterGroup.SKY_SERVE_CONTROLLER): + elif controller == backend_utils.Controllers.SKY_SERVE_CONTROLLER: sn = service_name logger.info( f'{fore.CYAN}Service name: ' @@ -3543,8 +3542,8 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - group = 
backend_utils.ReservedClusterGroup.check_cluster_name(name) - if group is not None or down: + controller = backend_utils.Controllers.check_cluster_name(name) + if controller is not None or down: return stop_str = ('\nTo stop the cluster:' f'\t{backend_utils.BOLD}sky stop {name}' diff --git a/sky/cli.py b/sky/cli.py index 36289a9ecd6..879ba69f358 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1889,11 +1889,11 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, reserved_clusters = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - group = backend_utils.ReservedClusterGroup.check_cluster_name( + controller = backend_utils.Controllers.check_cluster_name( cluster_name) - if group is not None: + if controller is not None: reserved_clusters.append(cluster_record) - hints.append(group.value.sky_status_hint) + hints.append(controller.value.sky_status_hint) else: nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( @@ -2022,14 +2022,13 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - group = backend_utils.ReservedClusterGroup.check_cluster_name( - cluster_name) - if group is not None: - cluster_group_name = group.value.group_name + controller = backend_utils.Controllers.check_cluster_name(cluster_name) + if controller is not None: + controller_name = controller.value.name # to display most recent entry for each reserved cluster # TODO(sgurram): fix assumption of sorted order of clusters - if cluster_group_name not in reserved_clusters: - reserved_clusters[cluster_group_name] = cluster_record + if controller_name not in reserved_clusters: + reserved_clusters[controller_name] = cluster_record else: nonreserved_cluster_records.append(cluster_record) @@ -2037,9 +2036,11 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin 
nonreserved_cluster_records, all) status_utils.show_cost_report_table(nonreserved_cluster_records, all) - for cluster_group_name, cluster_record in reserved_clusters.items(): + for controller_name, cluster_record in reserved_clusters.items(): status_utils.show_cost_report_table( - [cluster_record], all, reserved_group_name=cluster_group_name) + [cluster_record], + all, + reserved_group_name=controller_name.capitalize()) total_cost += cluster_record['total_cost'] click.echo(f'\n{colorama.Style.BRIGHT}' @@ -2291,9 +2292,9 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - group = backend_utils.ReservedClusterGroup.check_cluster_name(cluster) - assert group is not None, cluster - click.echo(group.value.decline_cancel_hint) + controller = backend_utils.Controllers.check_cluster_name(cluster) + assert controller is not None, cluster + click.echo(controller.value.decline_cancel_hint) sys.exit(1) except ValueError as e: raise click.UsageError(str(e)) @@ -2585,8 +2586,8 @@ def start( clusters = [ cluster['name'] for cluster in global_user_state.get_clusters() - if backend_utils.ReservedClusterGroup.check_cluster_name( - cluster['name']) is None + if backend_utils.Controllers.check_cluster_name(cluster['name']) is + None ] if not clusters: @@ -2654,8 +2655,7 @@ def start( # Checks for reserved clusters (spot controller). 
reserved, non_reserved = [], [] for name in to_start: - if backend_utils.ReservedClusterGroup.check_cluster_name( - name) is not None: + if backend_utils.Controllers.check_cluster_name(name) is not None: reserved.append(name) else: non_reserved.append(name) @@ -2773,13 +2773,12 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return - group = backend_utils.ReservedClusterGroup.check_cluster_name( - controller_name) - assert group is not None, controller_name + controller = backend_utils.Controllers.check_cluster_name(controller_name) + assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - group.value.decline_down_in_init_status_hint) + controller.value.decline_down_in_init_status_hint) msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' f'spot controller ({cluster_status.value}). Please be ' f'aware of the following:{colorama.Style.RESET_ALL}' @@ -2808,7 +2807,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): non_terminal_jobs): job_table = spot_lib.format_job_table(non_terminal_jobs, show_all=False) - msg = group.value.decline_down_for_dirty_controller_hint + msg = controller.value.decline_down_for_dirty_controller_hint # Add prefix to each line to align with the bullet point. 
msg += '\n'.join( [' ' + line for line in job_table.split('\n') if line != '']) @@ -2826,13 +2825,12 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): click.echo('Sky serve controller has already been torn down.') return - group = backend_utils.ReservedClusterGroup.check_cluster_name( - controller_name) - assert group is not None, controller_name + controller = backend_utils.Controllers.check_cluster_name(controller_name) + assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - group.value.decline_down_in_init_status_hint) + controller.value.decline_down_in_init_status_hint) elif cluster_status == status_lib.ClusterStatus.UP: with rich_utils.safe_status( '[bold cyan]Checking for running services[/]'): @@ -2845,17 +2843,17 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): if services: service_names = [service['name'] for service in services] with ux_utils.print_exception_no_traceback(): - msg = group.value.decline_down_for_dirty_controller_hint.format( - service_names=', '.join(service_names)) + msg = (controller.value.decline_down_for_dirty_controller_hint. + format(service_names=', '.join(service_names))) raise exceptions.NotSupportedError(msg) # Do nothing for STOPPED state, as it is safe to terminate the cluster. 
click.echo(f'Terminate sky serve controller: {controller_name}.') -_RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE = { - backend_utils.ReservedClusterGroup.SPOT_CONTROLLER: +_CONTROLLER_TO_HINT_OR_RAISE = { + backend_utils.Controllers.SPOT_CONTROLLER: (_hint_or_raise_for_down_spot_controller), - backend_utils.ReservedClusterGroup.SKY_SERVE_CONTROLLER: + backend_utils.Controllers.SKY_SERVE_CONTROLLER: (_hint_or_raise_for_down_sky_serve_controller), } @@ -2903,13 +2901,12 @@ def _down_or_stop_clusters( if len(names) > 0: reserved_clusters = [ name for name in names - if backend_utils.ReservedClusterGroup.check_cluster_name(name) - is not None + if backend_utils.Controllers.check_cluster_name(name) is not None ] reserved_clusters_str = ', '.join(map(repr, reserved_clusters)) names = [ - name for name in _get_glob_clusters(names) if - backend_utils.ReservedClusterGroup.check_cluster_name(name) is None + name for name in _get_glob_clusters(names) + if backend_utils.Controllers.check_cluster_name(name) is None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2943,11 +2940,10 @@ def _down_or_stop_clusters( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is currently not supported.') else: - reserved_group = (backend_utils.ReservedClusterGroup. - check_cluster_name(reserved_cluster)) - assert reserved_group is not None - hint_or_raise = _RESERVED_CLUSTER_GROUP_TO_HINT_OR_RAISE[ - reserved_group] + controller = backend_utils.Controllers.check_cluster_name( + reserved_cluster) + assert controller is not None + hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] hint_or_raise(reserved_cluster) confirm_str = 'delete' user_input = click.prompt( @@ -2969,10 +2965,8 @@ def _down_or_stop_clusters( # Otherwise, it would be very easy to accidentally delete a reserved # cluster. 
names = [ - record['name'] - for record in all_clusters - if backend_utils.ReservedClusterGroup.check_cluster_name( - record['name']) is None + record['name'] for record in all_clusters if + backend_utils.Controllers.check_cluster_name(record['name']) is None ] clusters = [] @@ -4023,7 +4017,7 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): $ sky spot cancel 1 2 3 """ _, handle = backend_utils.is_controller_up( - is_spot=True, + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None: # Hint messages already printed by the call above. @@ -4107,9 +4101,10 @@ def spot_dashboard(port: Optional[int]): hint = ( 'Dashboard is not available if spot controller is not up. Run a spot ' 'job first.') - _, handle = backend_utils.is_controller_up(is_spot=True, - stopped_message=hint, - non_existent_message=hint) + _, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + stopped_message=hint, + non_existent_message=hint) if handle is None: sys.exit(1) # SSH forward a free local port to remote's dashboard port. @@ -4381,7 +4376,7 @@ def serve_down(service_names: List[str], all: bool, yes: bool): f'Provided {argument_str!r}.') _, handle = backend_utils.is_controller_up( - is_spot=False, + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have been terminated.') if handle is None: # Hint messages already printed by the call above. 
diff --git a/sky/core.py b/sky/core.py index b7de65768c6..c604264254a 100644 --- a/sky/core.py +++ b/sky/core.py @@ -183,8 +183,7 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if backend_utils.ReservedClusterGroup.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' 'supported for skypilot controllers. Pass ' @@ -300,8 +299,7 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. """ - if backend_utils.ReservedClusterGroup.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: raise exceptions.NotSupportedError( f'Stopping sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -424,8 +422,7 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if backend_utils.ReservedClusterGroup.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -795,7 +792,8 @@ def spot_queue(refresh: bool, if not refresh: stop_msg = 'To view the latest job table: sky spot queue --refresh' controller_status, handle = backend_utils.is_controller_up( - is_spot=True, stopped_message=stop_msg) + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + stopped_message=stop_msg) if (refresh and controller_status in [ status_lib.ClusterStatus.STOPPED, status_lib.ClusterStatus.INIT @@ -867,7 +865,7 @@ def spot_cancel(name: Optional[str] = None, """ job_ids = [] if job_ids is None else job_ids cluster_status, handle = backend_utils.is_controller_up( - 
is_spot=True, + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None or handle.head_ip is None: # The error message is already printed in backend_utils.is_controller_up @@ -926,7 +924,7 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int], """ # TODO(zhwu): Automatically restart the spot controller controller_status, handle = backend_utils.is_controller_up( - is_spot=True, + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message=('Please restart the spot controller with ' f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')) if handle is None or handle.head_ip is None: @@ -1047,7 +1045,117 @@ def serve_status( if service_names is not None: if isinstance(service_names, str): service_names = [service_names] - return backend_utils.refresh_service_status(service_names) + + try: + backend_utils.check_network_connection() + except exceptions.NetworkError as e: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Failed to refresh service status due to network error.') from e + + # TODO(tian): This is so slow... It will take ~10s to refresh the status + # of controller. Can we optimize this? + controller_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='No service is found.') + + if handle is None or handle.head_ip is None: + # When the controller is STOPPED, the head_ip will be None, as + # it will be set in global_user_state.remove_cluster(). + # We do not directly check for UP because the controller may be + # in INIT state during another spot launch, but still have + # head_ip available. In this case, we can still try to ssh + # into the controller and fetch the job table. 
+ raise exceptions.ClusterNotUpError('Sky serve controller is not up.', + cluster_status=controller_status) + + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + + code = serve.ServeCodeGen.get_latest_info(service_names) + returncode, latest_info_payload, stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + + try: + subprocess_utils.handle_returncode(returncode, + code, + 'Failed to fetch services', + stderr, + stream_logs=False) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + + return serve.load_latest_info(latest_info_payload) + + +@usage_lib.entrypoint +# pylint: disable=redefined-builtin +def serve_down(service_names: Optional[Union[str, List[str]]] = None, + all: bool = False) -> None: + """Teardown a service. + + Please refer to the sky.cli.serve_down for the docs. + + Args: + service_names: Name of the service(s). + + Raises: + sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. + ValueError: if the arguments are invalid. + RuntimeError: if failed to terminate the service. + """ + if service_names is None: + service_names = [] + if isinstance(service_names, str): + service_names = [service_names] + cluster_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='All services should have terminated.') + if handle is None or handle.head_ip is None: + # The error message is already printed in backend_utils.is_controller_up + # TODO(zhwu): Move the error message into the exception. 
+ with ux_utils.print_exception_no_traceback(): + raise exceptions.ClusterNotUpError('', + cluster_status=cluster_status) + + service_names_str = ','.join(service_names) + if sum([len(service_names) > 0, all]) != 1: + argument_str = f'service_names={service_names_str}' if len( + service_names) > 0 else '' + argument_str += ' all' if all else '' + raise ValueError('Can only specify one of service_names or all. ' + f'Provided {argument_str!r}.') + + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + if all: + code = serve.ServeCodeGen.terminate_services(None) + else: + code = serve.ServeCodeGen.terminate_services(service_names) + + try: + returncode, stdout, _ = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False) + except exceptions.FetchIPError as e: + raise RuntimeError( + 'Failed to fetch controller IP. Please refresh controller ' + f'status by `sky status -r {serve.SKY_SERVE_CONTROLLER_NAME}` ' + 'and try again.') from e + + try: + subprocess_utils.handle_returncode(returncode, code, + 'Failed to terminate service', + stdout) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + + sky_logging.print(stdout) @usage_lib.entrypoint @@ -1108,7 +1216,8 @@ def serve_tail_logs( raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') controller_status, handle = backend_utils.is_controller_up( - is_spot=False, stopped_message='No service is found.') + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='No service is found.') if handle is None or handle.head_ip is None: msg = 'No service is found.' 
if controller_status == status_lib.ClusterStatus.INIT: @@ -1122,68 +1231,3 @@ def serve_tail_logs( target, replica_id, follow=follow) - - -@usage_lib.entrypoint -# pylint: disable=redefined-builtin -def serve_down(service_names: Optional[Union[str, List[str]]] = None, - all: bool = False) -> None: - """Teardown a service. - - Please refer to the sky.cli.serve_down for the docs. - - Args: - service_names: Name of the service(s). - - Raises: - sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. - ValueError: if the arguments are invalid. - RuntimeError: if failed to terminate the service. - """ - if service_names is None: - service_names = [] - if isinstance(service_names, str): - service_names = [service_names] - cluster_status, handle = backend_utils.is_controller_up( - is_spot=False, stopped_message='All services should have terminated.') - if handle is None or handle.head_ip is None: - # The error message is already printed in backend_utils.is_controller_up - # TODO(zhwu): Move the error message into the exception. - with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError('', - cluster_status=cluster_status) - - service_names_str = ','.join(service_names) - if sum([len(service_names) > 0, all]) != 1: - argument_str = f'service_names={service_names_str}' if len( - service_names) > 0 else '' - argument_str += ' all' if all else '' - raise ValueError('Can only specify one of service_names or all. ' - f'Provided {argument_str!r}.') - - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - if all: - code = serve.ServeCodeGen.terminate_services(None) - else: - code = serve.ServeCodeGen.terminate_services(service_names) - - try: - returncode, stdout, _ = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False) - except exceptions.FetchIPError as e: - raise RuntimeError( - 'Failed to fetch controller IP. 
Please refresh controller ' - f'status by `sky status -r {serve.SKY_SERVE_CONTROLLER_NAME}` ' - 'and try again.') from e - - try: - subprocess_utils.handle_returncode(returncode, code, - 'Failed to terminate service', - stdout) - except exceptions.CommandError as e: - raise RuntimeError(e.error_msg) from e - - sky_logging.print(stdout) diff --git a/sky/execution.py b/sky/execution.py index 98c69a646e6..946677e1fcc 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -370,9 +370,8 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - group = backend_utils.ReservedClusterGroup.check_cluster_name( - cluster_name) - if group is None and not _is_launched_by_sky_serve_controller: + controller = backend_utils.Controllers.check_cluster_name(cluster_name) + if controller is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # # Don't print if this job is launched by the spot controller, @@ -969,8 +968,9 @@ def _register_service_name(service_name: str) -> bool: True if the service name is registered successfully, False otherwise. 
""" with sky_logging.silent(): - _, handle = backend_utils.is_controller_up(is_spot=False, - stopped_message='') + _, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='') if handle is None or handle.head_ip is None: # The sky serve controller is STOPPED, or it is the first time # provisioning either after an AUTOSTOP, or the first time the From cbfa389aea62fa91e5d7493d2684d5de550dbd6d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 24 Oct 2023 11:08:14 -0700 Subject: [PATCH 152/223] upd schema --- sky/serve/service_spec.py | 6 +++--- sky/utils/schemas.py | 34 ++++++++++++++++++---------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index fecbceb6a1d..69c21ec1c11 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -6,8 +6,8 @@ import yaml -from sky.backends import backend_utils from sky.serve import constants +from sky.utils import common_utils from sky.utils import schemas from sky.utils import ux_utils @@ -57,8 +57,8 @@ def __init__( @staticmethod def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': - backend_utils.validate_schema(config, schemas.get_service_schema(), - 'Invalid service YAML: ') + common_utils.validate_schema(config, schemas.get_service_schema(), + 'Invalid service YAML: ') if 'replicas' in config and 'replica_policy' in config: with ux_utils.print_exception_no_traceback(): raise ValueError( diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 2a8e117333d..51a0a55620f 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -157,7 +157,7 @@ def get_storage_schema(): def get_service_schema(): return { - '$schema': 'http://json-schema.org/draft-07/schema#', + '$schema': 'https://json-schema.org/draft/2020-12/schema', 'type': 'object', 'required': ['port', 'readiness_probe'], 'additionalProperties': False, @@ -333,32 +333,34 @@ def 
get_cluster_schema(): def get_config_schema(): # pylint: disable=import-outside-toplevel from sky.utils import kubernetes_enums - return { - '$schema': 'https://json-schema.org/draft/2020-12/schema', + controller_resources_schema = { 'type': 'object', 'required': [], 'additionalProperties': False, 'properties': { - 'spot': { + 'controller': { 'type': 'object', 'required': [], 'additionalProperties': False, 'properties': { - 'controller': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'resources': { - k: v - for k, v in get_resources_schema().items() - # Validation may fail if $schema is included. - if k != '$schema' - }, - } + 'resources': { + k: v + for k, v in get_resources_schema().items() + # Validation may fail if $schema is included. + if k != '$schema' }, } }, + } + } + return { + '$schema': 'https://json-schema.org/draft/2020-12/schema', + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'spot': controller_resources_schema, + 'serve': controller_resources_schema, 'aws': { 'type': 'object', 'required': [], From 3bd6c8c5d6b1b12be91c5ce03f6b2d85fd68242c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 26 Oct 2023 22:53:52 -0700 Subject: [PATCH 153/223] change to async function for fastapi --- sky/serve/controller.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 4ac718ba042..f2fc790da8f 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -2,7 +2,6 @@ Responsible for autoscaling and replica management. 
""" -import asyncio import base64 import logging import pickle @@ -82,8 +81,8 @@ def _run_autoscaler(self): def run(self) -> None: @self.app.post('/controller/load_balancer_sync') - def load_balancer_sync(request: fastapi.Request): - request_data = asyncio.run(request.json()) + async def load_balancer_sync(request: fastapi.Request): + request_data = await request.json() request_information_payload = request_data.get( 'request_information') request_information = pickle.loads( From f97bd5b37f1aac14e2acecf3ce9e29d3be02dd54 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 1 Nov 2023 14:51:40 -0700 Subject: [PATCH 154/223] add multiple ports TODO --- sky/serve/service.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/serve/service.py b/sky/serve/service.py index 224acdf534a..9cb0e3a4987 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -165,6 +165,9 @@ def _start(service_name: str, task_yaml: str, job_id: int): constants.LOAD_BALANCER_PORT_START) # Start the load balancer. + # TODO(tian): Probably we could enable multiple ports specified in + # service spec and we could start multiple load balancers. + # After that, we will have a mapping from replica port to endpoint. 
load_balancer_process = multiprocessing.Process( target=serve_utils.RedirectOutputTo( load_balancer.run_load_balancer, From 8773d0c722adf1e1ec22c4cd210d6c84c71b278b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 1 Nov 2023 19:57:16 -0700 Subject: [PATCH 155/223] fix outdated example --- examples/serve/misc/cancel/service.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/serve/misc/cancel/service.yaml b/examples/serve/misc/cancel/service.yaml index ab736649f8c..f6281504f99 100644 --- a/examples/serve/misc/cancel/service.yaml +++ b/examples/serve/misc/cancel/service.yaml @@ -3,8 +3,6 @@ service: readiness_probe: path: /health initial_delay_seconds: 120 - controller_resources: - cpus: 2+ resources: cpus: 2+ From 2008590506e8135b4e57617c65e99e3cfa8537fd Mon Sep 17 00:00:00 2001 From: Ziming Mao Date: Thu, 2 Nov 2023 13:09:12 -0700 Subject: [PATCH 156/223] [SkyServe] Serving with Spot (#2749) * rebase and fix bugs * fix PR reviews * fix * fix comments * rename tests * fix yaml replica_num --- sky/serve/controller.py | 4 +- sky/serve/replica_managers.py | 63 +++++++++++++++++++++++++++-- sky/serve/serve_state.py | 4 ++ sky/utils/cli_utils/status_utils.py | 8 ++-- tests/skyserve/spot/recovery.yaml | 17 ++++++++ tests/skyserve/spot/user_bug.py | 6 +++ tests/skyserve/spot/user_bug.yaml | 16 ++++++++ tests/test_smoke.py | 56 +++++++++++++++++++++++++ 8 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 tests/skyserve/spot/recovery.yaml create mode 100644 tests/skyserve/spot/user_bug.py create mode 100644 tests/skyserve/spot/user_bug.yaml diff --git a/sky/serve/controller.py b/sky/serve/controller.py index f2fc790da8f..22d7fd9650f 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -41,8 +41,8 @@ def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, port: int) -> None: self.service_name = service_name self.replica_manager: replica_managers.ReplicaManager = ( - 
replica_managers.SkyPilotReplicaManager(service_name, - service_spec, + replica_managers.SkyPilotReplicaManager(service_name=service_name, + spec=service_spec, task_yaml_path=task_yaml)) self.autoscaler: autoscalers.Autoscaler = ( autoscalers.RequestRateAutoscaler( diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index d75829b0176..a2e253bc2d8 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -18,6 +18,7 @@ from sky import exceptions from sky import global_user_state from sky import sky_logging +from sky import status_lib from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state @@ -169,6 +170,8 @@ class ReplicaStatusProperty: first_ready_time: Optional[float] = None # None means sky.down is not called yet. sky_down_status: Optional[ProcessStatus] = None + # The replica's spot instance was preempted. + preempted: bool = False def is_scale_down_succeeded(self, initial_delay_seconds: int, auto_restart: bool) -> bool: @@ -193,6 +196,8 @@ def is_scale_down_succeeded(self, initial_delay_seconds: int, return True if self.user_app_failed: return False + if self.preempted: + return True if not self.service_ready_now: return False return self.first_ready_time is not None @@ -204,6 +209,8 @@ def should_track_status(self) -> bool: return False if self.user_app_failed: return False + if self.preempted: + return True return True def to_replica_status(self) -> serve_state.ReplicaStatus: @@ -211,6 +218,9 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: # Still launching return serve_state.ReplicaStatus.PROVISIONING if self.sky_down_status is not None: + if self.preempted: + # Replica (spot) is preempted + return serve_state.ReplicaStatus.PREEMPTED if self.sky_down_status == ProcessStatus.RUNNING: # sky.down is running return serve_state.ReplicaStatus.SHUTTING_DOWN @@ -472,6 +482,8 @@ def _sync_down_logs(): info = 
serve_state.get_replica_info_from_id(self.service_name, replica_id) assert info is not None + logger.info(f'preempted: {info.status_property.preempted}, ' + f'replica_id: {replica_id}') log_file_name = serve_utils.generate_replica_down_log_file_name( self.service_name, replica_id) p = multiprocessing.Process( @@ -490,6 +502,16 @@ def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: self._terminate_replica(replica_id, sync_down_logs=False) + def _recover_from_preemption(self, replica_id: int) -> None: + logger.info(f'Beginning recovery for preempted replica {replica_id}.') + # TODO(MaoZiming): Support spot recovery policies + info = serve_state.get_replica_info_from_id(self.service_name, + replica_id) + assert info is not None + info.status_property.preempted = True + serve_state.add_or_update_replica(self.service_name, replica_id, info) + self._terminate_replica(replica_id, sync_down_logs=False) + ################################# # ReplicaManager Daemon Threads # ################################# @@ -557,11 +579,16 @@ def _refresh_process_pool(self) -> None: if info.status_property.is_scale_down_succeeded( self.initial_delay_seconds, self.auto_restart): # This means the cluster is deleted due to - # a scale down. Delete the replica info + # a scale down or the cluster is recovering + # from preemption. Delete the replica info # so it won't count as a replica. - logger.info(f'Replica {replica_id} removed from the ' - 'replica table normally.') serve_state.remove_replica(self.service_name, replica_id) + if info.status_property.preempted: + removal_reason = 'for preemption recovery' + else: + removal_reason = 'normally' + logger.info(f'Replica {replica_id} removed from the ' + f'replica table {removal_reason}.') else: logger.info(f'Termination of replica {replica_id} ' 'finished. 
Replica info is kept since some ' @@ -673,6 +700,36 @@ def _probe_all_replicas(self) -> None: if info.status_property.first_ready_time is None: info.status_property.first_ready_time = probe_time else: + handle = info.handle + if handle is None: + logger.error('Cannot find handle for ' + f'replica {info.replica_id}.') + elif handle.launched_resources is None: + logger.error('Cannot find launched_resources in handle' + f' for replica {info.replica_id}.') + elif handle.launched_resources.use_spot: + # Pull the actual cluster status + # from the cloud provider to + # determine whether the cluster is preempted. + (cluster_status, + _) = backends.backend_utils.refresh_cluster_status_handle( + info.cluster_name, + force_refresh_statuses=set(status_lib.ClusterStatus)) + + if cluster_status != status_lib.ClusterStatus.UP: + # The cluster is (partially) preempted. + # It can be down, INIT or STOPPED, based on the + # interruption behavior of the cloud. + # Spot recovery is needed. + cluster_status_str = ( + '' if cluster_status is None else + f' (status: {cluster_status.value})') + logger.info(f'Replica {info.replica_id} ' + f'is preempted{cluster_status_str}.') + self._recover_from_preemption(info.replica_id) + + continue + if info.first_not_ready_time is None: info.first_not_ready_time = probe_time if info.status_property.first_ready_time is not None: diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 418862e5908..ab4e772f7e4 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -81,6 +81,9 @@ class ReplicaStatus(enum.Enum): # we should guarantee no resource leakage like regular sky. FAILED_CLEANUP = 'FAILED_CLEANUP' + # The replica is a spot VM and it is preempted by the cloud provider. + PREEMPTED = 'PREEMPTED' + # Unknown status. This should never happen. 
UNKNOWN = 'UNKNOWN' @@ -101,6 +104,7 @@ def colored_str(self) -> str: ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, ReplicaStatus.SHUTTING_DOWN: colorama.Fore.MAGENTA, ReplicaStatus.FAILED: colorama.Fore.RED, + ReplicaStatus.PREEMPTED: colorama.Fore.MAGENTA, ReplicaStatus.UNKNOWN: colorama.Fore.RED, } diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 0db43300217..7cbca6e0738 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -491,14 +491,14 @@ def _get_replica_resources(replica_record: _ReplicaRecord) -> str: return '-' assert isinstance(handle, backends.CloudVmRayResourceHandle) cloud = handle.launched_resources.cloud - launched_resource_str = f'{cloud}' if handle.launched_resources.accelerators is None: vcpu, _ = cloud.get_vcpus_mem_from_instance_type( handle.launched_resources.instance_type) - launched_resource_str += f'(vCPU={int(vcpu)})' + hardware = f'vCPU={int(vcpu)}' else: - launched_resource_str += f'({handle.launched_resources.accelerators})' - resources_str = (f'{handle.launched_nodes}x {launched_resource_str}') + hardware = f'{handle.launched_resources.accelerators})' + spot = '[Spot]' if handle.launched_resources.use_spot else '' + resources_str = f'{handle.launched_nodes}x {cloud}({spot}{hardware})' return resources_str diff --git a/tests/skyserve/spot/recovery.yaml b/tests/skyserve/spot/recovery.yaml new file mode 100644 index 00000000000..3eefd1bca8a --- /dev/null +++ b/tests/skyserve/spot/recovery.yaml @@ -0,0 +1,17 @@ +resources: + cloud: gcp + cpus: 2+ + zone: us-central1-a + use_spot: true + +workdir: examples/serve/http_server + +# Use 8080 to test jupyter service is terminated +run: python3 server.py --port 8080 + +service: + port: 8080 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 diff --git a/tests/skyserve/spot/user_bug.py b/tests/skyserve/spot/user_bug.py new file mode 100644 index 00000000000..61a8c166316 --- /dev/null 
+++ b/tests/skyserve/spot/user_bug.py @@ -0,0 +1,6 @@ +import time + +# The program exits to simulate a user app bug. +if __name__ == "__main__": + time.sleep(30) + assert False diff --git a/tests/skyserve/spot/user_bug.yaml b/tests/skyserve/spot/user_bug.yaml new file mode 100644 index 00000000000..da1e525afc2 --- /dev/null +++ b/tests/skyserve/spot/user_bug.yaml @@ -0,0 +1,16 @@ +resources: + cloud: gcp + cpus: 2+ + zone: us-central1-a + use_spot: True + +workdir: tests/skyserve/spot + +run: python3 user_bug.py + +service: + port: 8080 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6eda5b5feea..8c8fbb3acaa 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2734,6 +2734,62 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: run_one_test(test) +@pytest.mark.gcp +@pytest.mark.sky_serve +def test_skyserve_spot_recovery(): + name = _get_service_name() + zone = 'us-central1-a' + + # Reference: test_spot_recovery_gcp + def terminate_replica(replica_id: int) -> str: + cluster_name = serve.generate_replica_cluster_name(name, replica_id) + query_cmd = (f'gcloud compute instances list --filter=' + f'"(labels.ray-cluster-name:{cluster_name})" ' + f'--zones={zone} --format="value(name)"') + return (f'gcloud compute instances delete --zone={zone}' + f' --quiet $({query_cmd})') + + test = Test( + f'test-skyserve-spot-recovery-gcp', + [ + f'sky serve up -n {name} -y tests/skyserve/spot/recovery.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', + terminate_replica(1), + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + +@pytest.mark.gcp +@pytest.mark.sky_serve +def 
test_skyserve_spot_user_bug(): + """Tests that spot recovery doesn't occur for non-preemption failures""" + name = _get_service_name() + test = Test( + f'test-skyserve-spot-user-bug-gcp', + [ + f'sky serve up -n {name} -y tests/skyserve/spot/user_bug.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + # After failure due to user bug, the service should fail instead of + # triggering spot recovery. + '(while true; do' + f' output=$(sky serve status {name});' + ' echo "$output" | grep -q "FAILED" && break;' + ' echo "$output" | grep -q "PROVISIONING" && exit 1;' + ' sleep 10;' + f'done)', + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + @pytest.mark.gcp @pytest.mark.sky_serve def test_skyserve_replica_failure(): From ad404728e5c976755e5b774b2432f884cdc3c7e2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 2 Nov 2023 17:25:34 -0700 Subject: [PATCH 157/223] fix sky status pool wait --- sky/cli.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 628dde0a10a..1d0d6faa14e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1915,17 +1915,6 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: # down, and the hint for showing sky spot queue # will still be shown. success = False - - try: - pool.close() - pool.join() - except SystemExit as e: - # This is to avoid a "Exception ignored" problem caused by - # ray worker setting the sigterm handler to sys.exit(15) - # (see ray/_private/worker.py). - # TODO (zhwu): Remove any importing of ray in SkyPilot. 
- if e.code != 15: - raise return success, result spot_jobs_success = True @@ -1973,7 +1962,19 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: success, msg = _try_get_future_result(services_future) if not success: msg = 'KeyboardInterrupt' - click.echo(msg) + click.echo(msg) + + if show_spot_jobs or show_services: + try: + pool.close() + pool.join() + except SystemExit as e: + # This is to avoid a "Exception ignored" problem caused by + # ray worker setting the sigterm handler to sys.exit(15) + # (see ray/_private/worker.py). + # TODO (zhwu): Remove any importing of ray in SkyPilot. + if e.code != 15: + raise if num_pending_autostop > 0 and not refresh: # Don't print this hint if there's no pending autostop or user has From b5557bb32e1138e9e6090c4c245c8eb07176a6a5 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 3 Nov 2023 16:06:14 -0700 Subject: [PATCH 158/223] fix sync down logs failed --- sky/serve/replica_managers.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index a2e253bc2d8..e0545e959fa 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -5,6 +5,7 @@ import functools import multiprocessing import os +import pathlib import threading import time import traceback @@ -445,13 +446,10 @@ def _terminate_replica(self, replica_id: int, sync_down_logs: bool) -> None: 'already exists. Skipping.') return - def _sync_down_logs(): - info = serve_state.get_replica_info_from_id(self.service_name, - replica_id) - if info is None: - logger.error(f'Cannot find replica {replica_id} in the ' - 'replica table. 
Skipping syncing down logs.') - return + def _sync_down_logs(info: ReplicaInfo): + local_log_file_name = ( + serve_utils.generate_replica_local_log_file_name( + self.service_name, replica_id)) logger.info(f'Syncing down logs for replica {replica_id}...') backend = backends.CloudVmRayBackend() handle = global_user_state.get_handle_from_cluster_name( @@ -460,7 +458,15 @@ def _sync_down_logs(): logger.error(f'Cannot find cluster {info.cluster_name} for ' f'replica {replica_id} in the cluster table. ' 'Skipping syncing down logs.') + # Create an empty file to indicate that we have tried to + # sync down logs. There is a small possibility that the + # launch process is still running and the cluster is not + # created yet; or the launch process failed before the + # cluster record is created. In this case, we will not + # be able to sync down logs. + pathlib.Path(local_log_file_name).touch() return + assert isinstance(handle, backends.CloudVmRayResourceHandle) replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, 'replica_jobs') log_file = backend_utils.download_and_stream_latest_job_log( @@ -470,18 +476,18 @@ def _sync_down_logs(): log_position_hint='replica cluster', log_finish_hint=f'Replica: {replica_id}') if log_file is not None: - local_log_file_name = ( - serve_utils.generate_replica_local_log_file_name( - self.service_name, replica_id)) os.rename(log_file, local_log_file_name) - - if sync_down_logs: - _sync_down_logs() + else: + pathlib.Path(local_log_file_name).touch() logger.info(f'Terminating replica {replica_id}...') info = serve_state.get_replica_info_from_id(self.service_name, replica_id) assert info is not None + + if sync_down_logs: + _sync_down_logs(info) + logger.info(f'preempted: {info.status_property.preempted}, ' f'replica_id: {replica_id}') log_file_name = serve_utils.generate_replica_down_log_file_name( From 8414ea760edeb33956190412158c0532edbcc7df Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 3 Nov 2023 17:03:29 -0700 Subject: 
[PATCH 159/223] upd examples --- examples/serve/gorilla/gorilla.yaml | 10 ++++++++++ examples/serve/hello_skyserve/hello_skyserve.yaml | 10 ---------- examples/serve/hello_skyserve/index.html | 11 ----------- examples/serve/http_server/task.yaml | 8 ++++++++ examples/serve/llama2/llama2.yaml | 12 ++++++++++++ examples/serve/misc/cancel/service.yaml | 2 ++ examples/serve/ray_serve/ray_serve.yaml | 5 +++++ examples/serve/stable_diffusion_service.yaml | 7 +++++-- examples/serve/tgi_coder.yaml | 8 ++++++++ examples/serve/vicuna-v1.5.yaml | 8 ++++++++ examples/serve/vllm.yaml | 8 ++++++++ 11 files changed, 66 insertions(+), 23 deletions(-) delete mode 100644 examples/serve/hello_skyserve/hello_skyserve.yaml delete mode 100644 examples/serve/hello_skyserve/index.html diff --git a/examples/serve/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml index 10d2976093e..5549b7156dd 100644 --- a/examples/serve/gorilla/gorilla.yaml +++ b/examples/serve/gorilla/gorilla.yaml @@ -1,3 +1,13 @@ +# SkyServe YAML to run gorilla LLM. +# +# Usage: +# sky serve up -n gorilla examples/serve/gorilla/gorilla.yaml +# Then go to the examples/serve/gorilla/use_gorilla.ipynb +# and follow the instructions there. +# The endpoint will be printed in the console. You +# could also check the endpoint by running: +# sky serve status --endpoint gorilla + service: port: 8087 readiness_probe: diff --git a/examples/serve/hello_skyserve/hello_skyserve.yaml b/examples/serve/hello_skyserve/hello_skyserve.yaml deleted file mode 100644 index 1e76fc24a12..00000000000 --- a/examples/serve/hello_skyserve/hello_skyserve.yaml +++ /dev/null @@ -1,10 +0,0 @@ -service: - port: 8080 - readiness_probe: / - -resources: - cpus: 2+ - -workdir: . 
- -run: python3 -m http.server 8080 diff --git a/examples/serve/hello_skyserve/index.html b/examples/serve/hello_skyserve/index.html deleted file mode 100644 index 6a7649deacc..00000000000 --- a/examples/serve/hello_skyserve/index.html +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - Hello, SkyServe! - - -

Hello, SkyServe!

- - diff --git a/examples/serve/http_server/task.yaml b/examples/serve/http_server/task.yaml index 3c4d9046b45..e18e0ff9c1b 100644 --- a/examples/serve/http_server/task.yaml +++ b/examples/serve/http_server/task.yaml @@ -1,3 +1,11 @@ +# SkyServe YAML to run a simple http server. +# +# Usage: +# sky serve up -n http examples/serve/http_server/task.yaml +# The endpoint will be printed in the console. You +# could also check the endpoint by running: +# sky serve status --endpoint http + service: port: 8081 readiness_probe: diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml index cb6a70c1adb..54ff06d67fc 100644 --- a/examples/serve/llama2/llama2.yaml +++ b/examples/serve/llama2/llama2.yaml @@ -1,3 +1,15 @@ +# SkyServe YAML to run Llama2 LLM. +# +# Usage: replace the with +# your huggingface token, and run: +# sky serve up -n llama2 examples/serve/llama2/llama2.yaml +# Then run the following command to interact with +# the model: +# python3 examples/serve/llama2/chat.py +# The endpoint will be printed in the console. You +# could also check the endpoint by running: +# sky serve status --endpoint llama2 + service: port: 8087 readiness_probe: /v1/models diff --git a/examples/serve/misc/cancel/service.yaml b/examples/serve/misc/cancel/service.yaml index f6281504f99..1da3fb9f086 100644 --- a/examples/serve/misc/cancel/service.yaml +++ b/examples/serve/misc/cancel/service.yaml @@ -1,3 +1,5 @@ +# Usage: Please refer to the README.md in this directory. + service: port: 9000 readiness_probe: diff --git a/examples/serve/ray_serve/ray_serve.yaml b/examples/serve/ray_serve/ray_serve.yaml index c47c8b74be1..efcffb37110 100644 --- a/examples/serve/ray_serve/ray_serve.yaml +++ b/examples/serve/ray_serve/ray_serve.yaml @@ -1,3 +1,8 @@ +# SkyServe YAML to run a simple rayserve endpoint. 
+# +# Usage: +# sky serve up examples/serve/ray_serve/ray_serve.yaml + service: port: 8000 readiness_probe: / diff --git a/examples/serve/stable_diffusion_service.yaml b/examples/serve/stable_diffusion_service.yaml index 405b3bc7407..c3671fa29dd 100644 --- a/examples/serve/stable_diffusion_service.yaml +++ b/examples/serve/stable_diffusion_service.yaml @@ -1,7 +1,10 @@ -# SkyPilot YAML to run stable diffusion web tool on 1 V100 GPU. +# SkyServe YAML to run stable diffusion web tool. # # Usage: -# .. +# sky serve up -n sd examples/serve/stable_diffusion_service.yaml +# Then visit the endpoint printed in the console. You could also +# check the endpoint by running: +# sky serve status --endpoint sd service: port: 7860 diff --git a/examples/serve/tgi_coder.yaml b/examples/serve/tgi_coder.yaml index 71f109d1179..4363f0cf60c 100644 --- a/examples/serve/tgi_coder.yaml +++ b/examples/serve/tgi_coder.yaml @@ -1,3 +1,11 @@ +# SkyServe YAML to run HuggingFace TGI with WizardLM/WizardCoder-15B-V1.0. +# +# Usage: +# sky serve up -n tgi examples/serve/tgi_coder.yaml +# Then visit the endpoint printed in the console. You could also +# check the endpoint by running: +# sky serve status --endpoint tgi + service: port: 8082 readiness_probe: /health diff --git a/examples/serve/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml index 6ca32de5ba8..8b81b014cee 100644 --- a/examples/serve/vicuna-v1.5.yaml +++ b/examples/serve/vicuna-v1.5.yaml @@ -1,3 +1,11 @@ +# SkyServe YAML to run Vicuna LLM. +# +# Usage: +# sky serve up -n vicuna examples/serve/vicuna-v1.5.yaml +# The endpoint will be printed in the console. You +# could also check the endpoint by running: +# sky serve status --endpoint vicuna + service: port: 8087 readiness_probe: /v1/models diff --git a/examples/serve/vllm.yaml b/examples/serve/vllm.yaml index 01f63ccc33e..de2e16225bc 100644 --- a/examples/serve/vllm.yaml +++ b/examples/serve/vllm.yaml @@ -1,3 +1,11 @@ +# SkyServe YAML to run vLLM with Vicuna LLM. 
+# +# Usage: +# sky serve up -n vllm examples/serve/vllm.yaml +# The endpoint will be printed in the console. You +# could also check the endpoint by running: +# sky serve status --endpoint vllm + service: port: 8081 readiness_probe: From f6c3d70e2f73458f31f4df4420c00993c9fe77da Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 3 Nov 2023 23:58:24 -0700 Subject: [PATCH 160/223] add gorilla notebook --- examples/serve/gorilla/run_gorilla.py | 43 ----------- examples/serve/gorilla/use_gorilla.ipynb | 91 ++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 43 deletions(-) delete mode 100644 examples/serve/gorilla/run_gorilla.py create mode 100644 examples/serve/gorilla/use_gorilla.ipynb diff --git a/examples/serve/gorilla/run_gorilla.py b/examples/serve/gorilla/run_gorilla.py deleted file mode 100644 index 1d41106e5cf..00000000000 --- a/examples/serve/gorilla/run_gorilla.py +++ /dev/null @@ -1,43 +0,0 @@ -# Code is borrowed from gorilla's colab -# https://colab.research.google.com/drive/1DEBPsccVLF_aUnmD0FwPeHFrtdC0QIUP?usp=sharing # pylint: disable=line-too-long - -import urllib.parse - -import openai - -openai.api_key = 'EMPTY' # Key is ignored and does not matter -# SkyServe endpoint -endpoint = input('Enter SkyServe endpoint: ') -# endpoint = '34.132.127.197:8000' -openai.api_base = f'http://{endpoint}/v1' - - -# Report issues -def raise_issue(e, model, prompt): - issue_title = urllib.parse.quote('[bug] Hosted Gorilla: ') - issue_body = urllib.parse.quote( - f'Exception: {e}\nFailed model: {model}, for prompt: {prompt}') - issue_url = f'https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}' - print( - f'An exception has occurred: {e} \nPlease raise an issue here: {issue_url}' - ) - - -# Query Gorilla server -def get_gorilla_response(prompt, model='gorilla-mpt-7b-hf-v0'): - try: - completion = openai.ChatCompletion.create(model=model, - messages=[{ - 
'role': 'user', - 'content': prompt - }]) - return completion.choices[0].message.content - except Exception as e: - raise_issue(e, model, prompt) - - -prompt = 'I would like to translate "I feel very good today." from English to Chinese.' -print(get_gorilla_response(prompt)) - -prompt = 'I want to build a robot that can detecting objects in an image ‘cat.jpeg’. Input: [‘cat.jpeg’]' -print(get_gorilla_response(prompt)) diff --git a/examples/serve/gorilla/use_gorilla.ipynb b/examples/serve/gorilla/use_gorilla.ipynb new file mode 100644 index 00000000000..a7e37530d82 --- /dev/null +++ b/examples/serve/gorilla/use_gorilla.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SkyServe Gorilla Playground\n", + "\n", + "Welcome! Here is the sky serve gorilla playground. You can use this notebook to test your gorilla service.\n", + "\n", + "This notebook is borrowed from [gorilla's colab](https://colab.research.google.com/drive/1DEBPsccVLF_aUnmD0FwPeHFrtdC0QIUP?usp=sharing).\n", + "\n", + "To use this notebook, run `sky serve up examples/serve/gorilla/gorilla.yaml` first and paste the endpoint below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sky_serve_endpoint = '' # Enter your sky serve endpoint here" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then run the cell below to test your service!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install openai &> /dev/null\n", + "import openai\n", + "import urllib.parse\n", + "\n", + "openai.api_key = \"EMPTY\" # Key is ignored and does not matter\n", + "openai.api_base = f\"http://{sky_serve_endpoint}/v1\"\n", + "\n", + "# Report issues\n", + "def raise_issue(e, model, prompt):\n", + " issue_title = urllib.parse.quote(\"[bug] Hosted Gorilla: \")\n", + " issue_body = urllib.parse.quote(f\"Exception: {e}\\nFailed model: {model}, for prompt: {prompt}\")\n", + " issue_url = f\"https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-gorilla&projects=&template=hosted-gorilla-.md&title={issue_title}&body={issue_body}\"\n", + " print(f\"An exception has occurred: {e} \\nPlease raise an issue here: {issue_url}\")\n", + "\n", + "# Query Gorilla server\n", + "def get_gorilla_response(prompt=\"I would like to translate from English to French.\", model=\"gorilla-7b-hf-v1\"):\n", + " try:\n", + " completion = openai.ChatCompletion.create(\n", + " model=model,\n", + " messages=[{\"role\": \"user\", \"content\": prompt}]\n", + " )\n", + " return completion.choices[0].message.content\n", + " except Exception as e:\n", + " raise_issue(e, model, prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gorilla `gorilla-mpt-7b-hf-v1` with code snippets\n", + "# Translation\n", + "prompt = \"I would like to translate 'I feel very good today.' from English to Chinese.\"\n", + "print(get_gorilla_response(prompt, model=\"gorilla-7b-hf-v1\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Gorilla `gorilla-7b-hf-v1` with code snippets\n", + "# Object Detection\n", + "prompt = \"I want to build a robot that can detecting objects in an image ‘cat.jpeg’. 
Input: [‘cat.jpeg’]\"\n", + "print(get_gorilla_response(prompt, model=\"gorilla-7b-hf-v1\"))" + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2 +} From d0365606fa7db82a75bed9d9574f3b9051426b80 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 5 Nov 2023 15:57:00 -0800 Subject: [PATCH 161/223] add todo for customizable setup commands --- sky/execution.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/execution.py b/sky/execution.py index 946677e1fcc..9b3e54f9b1d 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1078,6 +1078,7 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, vars_to_fill, output_path=controller_file.name) + # TODO(tian): Probably we should support customizable setup commands. controller_task = task_lib.Task.from_yaml(controller_file.name) controller_task.set_resources(controller_resources) From 8327a2e7de1b695b99066cc80ed56040a1f2de63 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 10:43:19 -0800 Subject: [PATCH 162/223] add launch log to streaming --- sky/serve/replica_managers.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index e0545e959fa..dbfa3514401 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -5,7 +5,6 @@ import functools import multiprocessing import os -import pathlib import threading import time import traceback @@ -446,10 +445,18 @@ def _terminate_replica(self, replica_id: int, sync_down_logs: bool) -> None: 'already exists. 
Skipping.') return - def _sync_down_logs(info: ReplicaInfo): + def _download_and_stream_logs(info: ReplicaInfo): + launch_log_file_name = ( + serve_utils.generate_replica_launch_log_file_name( + self.service_name, replica_id)) local_log_file_name = ( serve_utils.generate_replica_local_log_file_name( self.service_name, replica_id)) + # Write launch log to local log file + with open(local_log_file_name, + 'w') as local_file, open(launch_log_file_name, + 'r') as launch_file: + local_file.write(launch_file.read()) logger.info(f'Syncing down logs for replica {replica_id}...') backend = backends.CloudVmRayBackend() handle = global_user_state.get_handle_from_cluster_name( @@ -458,13 +465,6 @@ def _sync_down_logs(info: ReplicaInfo): logger.error(f'Cannot find cluster {info.cluster_name} for ' f'replica {replica_id} in the cluster table. ' 'Skipping syncing down logs.') - # Create an empty file to indicate that we have tried to - # sync down logs. There is a small possibility that the - # launch process is still running and the cluster is not - # created yet; or the launch process failed before the - # cluster record is created. In this case, we will not - # be able to sync down logs. 
- pathlib.Path(local_log_file_name).touch() return assert isinstance(handle, backends.CloudVmRayResourceHandle) replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, @@ -476,9 +476,9 @@ def _sync_down_logs(info: ReplicaInfo): log_position_hint='replica cluster', log_finish_hint=f'Replica: {replica_id}') if log_file is not None: - os.rename(log_file, local_log_file_name) - else: - pathlib.Path(local_log_file_name).touch() + with open(local_log_file_name, + 'a') as local_file, open(log_file, 'r') as job_file: + local_file.write(job_file.read()) logger.info(f'Terminating replica {replica_id}...') info = serve_state.get_replica_info_from_id(self.service_name, @@ -486,7 +486,7 @@ def _sync_down_logs(info: ReplicaInfo): assert info is not None if sync_down_logs: - _sync_down_logs(info) + _download_and_stream_logs(info) logger.info(f'preempted: {info.status_property.preempted}, ' f'replica_id: {replica_id}') From 6f3f8093fbd62907dda5b983211203a31630fcb2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 10:45:57 -0800 Subject: [PATCH 163/223] move comment position --- sky/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 1d0d6faa14e..b2a748a3440 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1911,9 +1911,6 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: result = future.get() except KeyboardInterrupt: pool.terminate() - # Set to -1, so that the controller is not considered - # down, and the hint for showing sky spot queue - # will still be shown. success = False return success, result @@ -1927,6 +1924,9 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: if spot_jobs_success: num_in_progress_jobs, msg = result else: + # Set to -1, so that the controller is not considered + # down, and the hint for showing sky spot queue + # will still be shown. 
num_in_progress_jobs = -1 msg = 'KeyboardInterrupt' From f52ebb0a00ee9c730bfbc8b54d136c4ba77578ee Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 13:30:51 -0800 Subject: [PATCH 164/223] catch error and print log --- sky/serve/replica_managers.py | 11 +++++++++-- sky/serve/serve_utils.py | 10 +++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index dbfa3514401..64a2d6e7058 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -53,7 +53,14 @@ def launch_cluster(task_yaml_path: str, or some error happened before provisioning and will happen again if retry. """ - task = sky.Task.from_yaml(task_yaml_path) + try: + task = sky.Task.from_yaml(task_yaml_path) + except Exception as e: # pylint: disable=broad-except + logger.error('Failed to construct task object from yaml file with ' + f'error {common_utils.format_exception(e)}') + raise RuntimeError( + 'Failed to launch the sky serve replica cluster ' + f'{cluster_name} due to invalid task yaml file.') from e retry_cnt = 0 backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS) while True: @@ -464,7 +471,7 @@ def _download_and_stream_logs(info: ReplicaInfo): if handle is None: logger.error(f'Cannot find cluster {info.cluster_name} for ' f'replica {replica_id} in the cluster table. 
' - 'Skipping syncing down logs.') + 'Skipping syncing down job logs.') return assert isinstance(handle, backends.CloudVmRayResourceHandle) replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index f77c71c2b98..3b286d8a1f3 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -8,6 +8,7 @@ import shlex import threading import time +import traceback import typing from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, TextIO, Type, TypeVar) @@ -171,12 +172,19 @@ def run(self, *args, **kwargs): # reconfigure logger since the logger is initialized before # with previous stdout/stderr sky_logging.reload_logger() + logger = sky_logging.init_logger(__name__) # The subprocess_util.run('sky status') inside # sky.execution::_execute cannot be redirect, since we cannot # directly operate on the stdout/stderr of the subprocess. This # is because some code in skypilot will specify the stdout/stderr # of the subprocess. - self.func(*args, **kwargs) + try: + self.func(*args, **kwargs) + except Exception as e: # pylint: disable=broad-except + logger.error(f'Failed to run {self.func.__name__}. 
' + f'Details: {common_utils.format_exception(e)}\n' + f'Traceback:\n{traceback.format_exc()}') + raise def generate_remote_service_dir_name(service_name: str) -> str: From 8377b8a79d03dbd7f606948ae780a0e095b359e2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 14:16:10 -0800 Subject: [PATCH 165/223] align output --- sky/backends/cloud_vm_ray_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3a342d79571..14d756cbb23 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3401,7 +3401,7 @@ def _exec_code_on_head( '\nTo see detailed info:\t\t' f'{backend_utils.BOLD}sky serve status {sn} (-a)' f'{backend_utils.RESET_BOLD}' - '\nTo teardown the service:\t' + '\nTo teardown the service:\t\t' f'{backend_utils.BOLD}sky serve down {sn}' f'{backend_utils.RESET_BOLD}' '\n' From 114e751ae0aff328744f479ce6bb86c6f87718e0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 14:29:41 -0800 Subject: [PATCH 166/223] ux --- sky/serve/replica_managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 64a2d6e7058..d3455e47024 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -59,8 +59,8 @@ def launch_cluster(task_yaml_path: str, logger.error('Failed to construct task object from yaml file with ' f'error {common_utils.format_exception(e)}') raise RuntimeError( - 'Failed to launch the sky serve replica cluster ' - f'{cluster_name} due to invalid task yaml file.') from e + f'Failed to launch the sky serve replica cluster {cluster_name} ' + 'due to failing to initialize sky.Task from yaml file.') from e retry_cnt = 0 backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS) while True: From 30b36ba9bae3c2c858c23118032ae0c9572e2ab1 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 7 Nov 2023 14:37:31 
-0800 Subject: [PATCH 167/223] fix storage cleanup failure --- sky/serve/service.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sky/serve/service.py b/sky/serve/service.py index 9cb0e3a4987..9e0ef0cc81a 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -89,9 +89,13 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: info) failed = True logger.error(f'Replica {info.replica_id} failed to terminate.') - task = task_lib.Task.from_yaml(task_yaml) - backend = cloud_vm_ray_backend.CloudVmRayBackend() - backend.teardown_ephemeral_storage(task) + try: + task = task_lib.Task.from_yaml(task_yaml) + backend = cloud_vm_ray_backend.CloudVmRayBackend() + backend.teardown_ephemeral_storage(task) + except Exception as e: # pylint: disable=broad-except + logger.error(f'Failed to clean up storage: {e}') + failed = True return failed From 74c83af84d79b642dfa288176c802ea5ab9c2d4b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 09:38:23 -0800 Subject: [PATCH 168/223] fix extra newline --- sky/utils/cli_utils/status_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 7cbca6e0738..ed64dd1d468 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -174,7 +174,7 @@ def format_replica_table(replica_records: List[_ReplicaRecord], truncate_hint = '' if not show_all: if len(replica_records) > REPLICA_TRUNC_NUM: - truncate_hint = '... (use --all to show all replicas)\n' + truncate_hint = '\n... 
(use --all to show all replicas)' replica_records = replica_records[:REPLICA_TRUNC_NUM] columns = [] @@ -189,7 +189,7 @@ def format_replica_table(replica_records: List[_ReplicaRecord], row.append(status_column.calc(record)) replica_table.add_row(row) - return f'{replica_table}\n{truncate_hint}' + return f'{replica_table}{truncate_hint}' def get_total_cost_of_displayed_records( From d18badb1e26682d4355c790ef21351b971cbd7db Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 10:10:31 -0800 Subject: [PATCH 169/223] comments --- examples/serve/llama2/llama2.yaml | 2 ++ examples/serve/ray_serve/serve.py | 1 + sky/__init__.py | 6 ------ sky/cli.py | 16 +++++++++------- sky/core.py | 8 ++++---- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml index 54ff06d67fc..cd1e9f481ba 100644 --- a/examples/serve/llama2/llama2.yaml +++ b/examples/serve/llama2/llama2.yaml @@ -10,6 +10,8 @@ # could also check the endpoint by running: # sky serve status --endpoint llama2 +# TODO(tian): Change usage to `HF_TOKEN= sky serve up -n llama2 examples/serve/llama2/llama2.yaml --env HF_TOKEN` once we have `--env` enabled. + service: port: 8087 readiness_probe: /v1/models diff --git a/examples/serve/ray_serve/serve.py b/examples/serve/ray_serve/serve.py index 8aecabd3e78..995d1c966db 100644 --- a/examples/serve/ray_serve/serve.py +++ b/examples/serve/ray_serve/serve.py @@ -4,6 +4,7 @@ from starlette import requests +# 2 Ray actors, each running on 1 vCPU. 
@serve.deployment(route_prefix='/', num_replicas=2) class ModelDeployment: diff --git a/sky/__init__.py b/sky/__init__.py index 88f773f8d76..38f9ae36a12 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -70,9 +70,6 @@ def get_git_commit(): from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget from sky.resources import Resources -from sky.serve import ReplicaStatus -from sky.serve import ServiceComponent -from sky.serve import ServiceStatus from sky.skylet.job_lib import JobStatus from sky.status_lib import ClusterStatus from sky.task import Task @@ -109,10 +106,7 @@ def get_git_commit(): 'Storage', 'StorageMode', 'StoreType', - 'ServiceComponent', 'ClusterStatus', - 'ReplicaStatus', - 'ServiceStatus', 'JobStatus', # APIs 'Dag', diff --git a/sky/cli.py b/sky/cli.py index b2a748a3440..890c1afb95e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -51,6 +51,7 @@ from sky import core from sky import exceptions from sky import global_user_state +from sky import serve as serve_lib from sky import sky_logging from sky import spot as spot_lib from sky import status_lib @@ -4289,8 +4290,9 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): connection or there are too many requests overwhelming the replica. - ``SHUTTING_DOWN``: The replica is being shut down. This usually happens - when the replica is being scaled down or some error occurred. SkyServe - will terminate all replicas that errored. + when the replica is being scaled down, some error occurred, or the + `sky serve down` command is called. SkyServe will terminate all replicas + that errored. - ``FAILED``: Some error occurred when the replica is serving requests. This indicates that the replica is already shut down. 
(Otherwise, it is @@ -4467,22 +4469,22 @@ def serve_logs( if target is not None: click.secho(f'Overriding --target={target} with --controller.', fg='yellow') - target_component = sky.ServiceComponent.CONTROLLER + target_component = serve_lib.ServiceComponent.CONTROLLER elif load_balancer: if target is not None: click.secho(f'Overriding --target={target} with --load-balancer.', fg='yellow') - target_component = sky.ServiceComponent.LOAD_BALANCER + target_component = serve_lib.ServiceComponent.LOAD_BALANCER elif target is not None: # Change load-balancer to load_balancer to match the enum. target = target.replace('-', '_') - target_component = sky.ServiceComponent(target) - if (target_component == sky.ServiceComponent.REPLICA and + target_component = serve_lib.ServiceComponent(target) + if (target_component == serve_lib.ServiceComponent.REPLICA and not have_replica_id): raise click.UsageError( 'REPLICA_ID must be specified when using --target replica.') else: - target_component = sky.ServiceComponent.REPLICA + target_component = serve_lib.ServiceComponent.REPLICA try: core.serve_tail_logs(service_name, target=target_component, diff --git a/sky/core.py b/sky/core.py index c604264254a..064ec136640 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1027,7 +1027,7 @@ def serve_status( { 'replica_id': (int) replica id, 'name': (str) replica name, - 'status': (sky.ReplicaStatus) replica status, + 'status': (sky.serve.ReplicaStatus) replica status, 'handle': (ResourceHandle) handle of the replica cluster, } @@ -1176,7 +1176,7 @@ def serve_tail_logs( # replica_id=3, # Must be specified when target is REPLICA. 
) - `target` is a enum of sky.ServiceComponent, which can be one of: + `target` is a enum of sky.serve.ServiceComponent, which can be one of: - CONTROLLER - LOAD_BALANCER - REPLICA @@ -1187,7 +1187,7 @@ def serve_tail_logs( To tail controller logs: # follow default to True core.serve_tail_logs( - service_name, target=sky.ServiceComponent.CONTROLLER) + service_name, target=sky.serve.ServiceComponent.CONTROLLER) To print replica 3 logs: # Pass target as a lower-case string is also supported. @@ -1204,7 +1204,7 @@ def serve_tail_logs( if not isinstance(target, serve.ServiceComponent): with ux_utils.print_exception_no_traceback(): raise ValueError(f'`target` must be a string or ' - f'sky.ServiceComponent, got {type(target)}.') + f'sky.serve.ServiceComponent, got {type(target)}.') if target == serve.ServiceComponent.REPLICA: if replica_id is None: with ux_utils.print_exception_no_traceback(): From c4b8b2442e19de17c8e203a21aadbbd2276b3354 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 8 Nov 2023 10:13:40 -0800 Subject: [PATCH 170/223] Apply suggestions from code review Co-authored-by: Zongheng Yang --- sky/backends/backend_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1cd9cf4038f..6c9891ac9b4 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -105,7 +105,7 @@ @dataclasses.dataclass -class ControllerSpec: +class _ControllerSpec: """Spec for skypilot controllers.""" name: str cluster_name: str From e241aff8177c03f85ddf4d8c40318b94c74d8bd3 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 10:14:05 -0800 Subject: [PATCH 171/223] rename --- sky/backends/backend_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6c9891ac9b4..c316342d73c 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -121,7 +121,7 @@ class 
Controllers(enum.Enum): """Skypilot controllers.""" # NOTE(dev): Keep this align with # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE - SPOT_CONTROLLER = ControllerSpec( + SPOT_CONTROLLER = _ControllerSpec( name='managed spot controller', cluster_name=spot_lib.SPOT_CONTROLLER_NAME, sky_status_hint=( @@ -147,7 +147,7 @@ class Controllers(enum.Enum): f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' 'managed spot controller. '), default_hint_if_non_existent='No managed spot jobs are found.') - SKY_SERVE_CONTROLLER = ControllerSpec( + SKY_SERVE_CONTROLLER = _ControllerSpec( name='sky serve controller', cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, sky_status_hint=( From 6ba5c9fc4f6a1e6f5fe883114f8ea7d748f9c020 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 8 Nov 2023 10:16:24 -0800 Subject: [PATCH 172/223] Update sky/core.py Co-authored-by: Zongheng Yang --- sky/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index 064ec136640..f55217b0112 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1063,7 +1063,7 @@ def serve_status( # When the controller is STOPPED, the head_ip will be None, as # it will be set in global_user_state.remove_cluster(). # We do not directly check for UP because the controller may be - # in INIT state during another spot launch, but still have + # in INIT state during another `sky serve up`, but still have # head_ip available. In this case, we can still try to ssh # into the controller and fetch the job table. 
raise exceptions.ClusterNotUpError('Sky serve controller is not up.', From 8ffe0be130a1de766c0505ecf3e702a3adc1ee33 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 11:07:06 -0800 Subject: [PATCH 173/223] apply suggestion from code review --- sky/core.py | 6 ++-- sky/execution.py | 78 +++++++++++++++++++++------------------- sky/serve/__init__.py | 2 +- sky/serve/serve_utils.py | 42 +++++++++++----------- sky/skylet/constants.py | 8 ++--- sky/utils/schemas.py | 48 ++++++++++++++----------- 6 files changed, 98 insertions(+), 86 deletions(-) diff --git a/sky/core.py b/sky/core.py index f55217b0112..e148da91725 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1072,8 +1072,8 @@ def serve_status( backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = serve.ServeCodeGen.get_latest_info(service_names) - returncode, latest_info_payload, stderr = backend.run_on_head( + code = serve.ServeCodeGen.get_serve_status(service_names) + returncode, serve_status_payload, stderr = backend.run_on_head( handle, code, require_outputs=True, @@ -1089,7 +1089,7 @@ def serve_status( except exceptions.CommandError as e: raise RuntimeError(e.error_msg) from e - return serve.load_latest_info(latest_info_payload) + return serve.load_serve_status(serve_status_payload) @usage_lib.entrypoint diff --git a/sky/execution.py b/sky/execution.py index 9b3e54f9b1d..b3d0a60165c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -659,7 +659,7 @@ def spot_launch( dag_utils.fill_default_spot_config_in_dag_for_spot_launch(dag) for task_ in dag.tasks: - _maybe_translate_local_file_mounts_and_sync_up(task_, prefix='spot') + _maybe_translate_local_file_mounts_and_sync_up(task_, path='spot') with tempfile.NamedTemporaryFile(prefix=f'spot-dag-{dag.name}-', mode='w') as f: @@ -791,7 +791,7 @@ def spot_launch( def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, - prefix: str): + path: str): """Translates local->VM 
mounts into Storage->VM, then syncs up any Storage. Eagerly syncing up local->Storage ensures Storage->VM would work at task @@ -863,7 +863,7 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, if os.path.isfile(os.path.abspath(os.path.expanduser(src))): copy_mounts_with_file_in_src[dst] = src continue - bucket_name = constants.FM_BUCKET_NAME.format( + bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( username=getpass.getuser(), id=f'{run_id}-{i}', ) @@ -879,13 +879,14 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, # Step 3: Translate local file mounts with file in src to SkyPilot storage. # Hard link the files in src to a temporary directory, and upload folder. - local_fm_path = os.path.join(tempfile.gettempdir(), - constants.FM_LOCAL_TMP_DIR.format(id=run_id)) + local_fm_path = os.path.join( + tempfile.gettempdir(), + constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) os.makedirs(local_fm_path, exist_ok=True) - file_bucket_name = constants.FM_FILE_ONLY_BUCKET_NAME.format( + file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format( username=getpass.getuser(), id=run_id) - file_mount_remote_tmp_dir = constants.FM_REMOTE_TMP_DIR.format( - prefix=prefix) + file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format( + path) if copy_mounts_with_file_in_src: src_to_file_id = {} for i, src in enumerate(set(copy_mounts_with_file_in_src.values())): @@ -961,26 +962,13 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, storage_obj.force_delete = True -def _register_service_name(service_name: str) -> bool: +def _register_service_name(service_name: str, + handle: backends.CloudVmRayResourceHandle) -> bool: """Register a service name on the controller if it is running. Returns: - True if the service name is registered successfully, False otherwise. + True if the service name is not occupied, False otherwise. 
""" - with sky_logging.silent(): - _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message='') - if handle is None or handle.head_ip is None: - # The sky serve controller is STOPPED, or it is the first time - # provisioning either after an AUTOSTOP, or the first time the - # controller is created, which means there is no service on the - # controller. We will create the service database record in - # sky.serve.service._start once the controller is running. - logger.info('The sky serve controller is not running. ' - 'Will register the service once the controller is up.') - return True - # The sky serve controller is UP, check if the service exists. code = serve.ServeCodeGen.add_service_if_not_exist(service_name) backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) @@ -1006,9 +994,6 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: ('serve', 'controller', 'resources'), None) if custom_controller_resources_config is not None: controller_resources_config.update(custom_controller_resources_config) - if 'ports' in controller_resources_config: - with ux_utils.print_exception_no_traceback(): - raise ValueError('Cannot specify ports for controller resources.') try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config) @@ -1034,24 +1019,43 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: with rich_utils.safe_status( '[cyan]Registering service on the controller[/]'): - success = _register_service_name(service_name) - if not success: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'The service {service_name!r} is already running. ' - 'Update service will be supported in the future. 
For now, ' - '`sky serve down` and then `sky serve up` again.') + with sky_logging.silent(): + status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='') + if handle is None or handle.head_ip is None: + # The sky serve controller is STOPPED, or it is the first time + # provisioning either after an AUTOSTOP, or the first time the + # controller is created, which means there is no service on the + # controller. We will create the service database record in + # sky.serve.service._start once the controller is running. + logger.info('The sky serve controller is not running. ' + 'Will register the service once the controller is up.') + else: + # The sky serve controller is UP, check if the service exists. + success = _register_service_name(service_name, handle) + if not success: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'The service {service_name!r} is already running. ' + 'Updating a service will be supported in the future. ' + 'For now, `sky serve down` and then `sky serve up` ' + 'again.') controller_name = serve.SKY_SERVE_CONTROLLER_NAME # TODO(tian): Probably run another sky.launch after we get the load balancer # port from the controller? So we don't need to open so many ports here. Or, # we should have a nginx traffic control to refuse any connection to the # unregistered ports. - # TODO(tian): Probably choose the same cloud if replica cloud is specified? + # Choose the same cloud if controller is not launched, controller resources + # not specify cloud and replica cloud is specified. 
+ controller_cloud = (requested_resources.cloud if status is None and + controller_resources.cloud is None and + requested_resources.cloud is not None else None) controller_resources = controller_resources.copy( - ports=[serve.LOAD_BALANCER_PORT_RANGE]) + cloud=controller_cloud, ports=[serve.LOAD_BALANCER_PORT_RANGE]) - _maybe_translate_local_file_mounts_and_sync_up(task, prefix='serve') + _maybe_translate_local_file_mounts_and_sync_up(task, path='serve') with tempfile.NamedTemporaryFile( prefix=f'service-task-{service_name}-', diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index fe91b577bd6..7955900ccaf 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -14,7 +14,7 @@ from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import load_add_service_result -from sky.serve.serve_utils import load_latest_info +from sky.serve.serve_utils import load_serve_status from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 3b286d8a1f3..9a3ca84b378 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -297,8 +297,8 @@ def get_replica_info(service_name: str, ] -def get_latest_info(service_name: str, - with_replica_info: bool = True) -> Dict[str, Any]: +def get_serve_status(service_name: str, + with_replica_info: bool = True) -> Dict[str, Any]: """Get the latest information of the service. 
Args: @@ -317,37 +317,37 @@ def get_latest_info(service_name: str, return record -def get_latest_info_encoded(service_names: Optional[List[str]]) -> str: - latest_infos = [] +def get_serve_status_encoded(service_names: Optional[List[str]]) -> str: + serve_statuss = [] if service_names is None: # Get all service names service_names = serve_state.get_glob_service_names(None) for service_name in service_names: - latest_info = get_latest_info(service_name) - latest_infos.append({ + serve_status = get_serve_status(service_name) + serve_statuss.append({ k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in latest_info.items() + for k, v in serve_status.items() }) - return common_utils.encode_payload(latest_infos) + return common_utils.encode_payload(serve_statuss) -def load_latest_info(payload: str) -> List[Dict[str, Any]]: - latest_infos_encoded = common_utils.decode_payload(payload) - latest_infos = [] - for latest_info in latest_infos_encoded: - latest_infos.append({ +def load_serve_status(payload: str) -> List[Dict[str, Any]]: + serve_statuss_encoded = common_utils.decode_payload(payload) + serve_statuss = [] + for serve_status in serve_statuss_encoded: + serve_statuss.append({ k: pickle.loads(base64.b64decode(v)) - for k, v in latest_info.items() + for k, v in serve_status.items() }) - return latest_infos + return serve_statuss def terminate_services(service_names: Optional[List[str]]) -> str: service_names = serve_state.get_glob_service_names(service_names) terminated_service_names = [] for service_name in service_names: - latest_info = get_latest_info(service_name, with_replica_info=False) - if (latest_info['status'] + serve_status = get_serve_status(service_name, with_replica_info=False) + if (serve_status['status'] in serve_state.ServiceStatus.refuse_to_terminate_statuses()): # TODO(tian): Cleanup replicas for CONTROLLER_FAILED status. Seems # like spot doesn't implement this yet? 
@@ -555,11 +555,13 @@ def _service_is_terminal() -> bool: return '' +# TODO(tian): Use REST API instead of SSH in the future. This will require +# authentication. class ServeCodeGen: """Code generator for SkyServe. Usage: - >> code = ServeCodeGen.get_latest_info(service_name) + >> code = ServeCodeGen.get_serve_status(service_name) """ _PREFIX = [ 'from sky.serve import serve_state', @@ -575,9 +577,9 @@ def add_service_if_not_exist(cls, service_name: str) -> str: return cls._build(code) @classmethod - def get_latest_info(cls, service_names: Optional[List[str]]) -> str: + def get_serve_status(cls, service_names: Optional[List[str]]) -> str: code = [ - f'msg = serve_utils.get_latest_info_encoded({service_names!r})', + f'msg = serve_utils.get_serve_status_encoded({service_names!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 3fb187c0c15..b2a2583d637 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -97,7 +97,7 @@ CLUSTER_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' 
WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}' -FM_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' -FM_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' -FM_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' -FM_REMOTE_TMP_DIR = '/tmp/sky-{prefix}-filemounts-files' +FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' +FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' +FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' +FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 51a0a55620f..ac523b00c5e 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -333,34 +333,40 @@ def get_cluster_schema(): def get_config_schema(): # pylint: disable=import-outside-toplevel from sky.utils import kubernetes_enums - controller_resources_schema = { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'controller': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'resources': { - k: v - for k, v in get_resources_schema().items() - # Validation may fail if $schema is included. - if k != '$schema' - }, - } - }, + + def _get_controller_resources_schema(is_serve: bool = False): + resources_schema = { + k: v + for k, v in get_resources_schema().items() + # Validation may fail if $schema is included. 
+ if k != '$schema' } - } + if is_serve: + resources_schema['properties'].pop('ports') + return { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'controller': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'resources': resources_schema, + } + }, + } + } + return { '$schema': 'https://json-schema.org/draft/2020-12/schema', 'type': 'object', 'required': [], 'additionalProperties': False, 'properties': { - 'spot': controller_resources_schema, - 'serve': controller_resources_schema, + 'spot': _get_controller_resources_schema(), + 'serve': _get_controller_resources_schema(is_serve=True), 'aws': { 'type': 'object', 'required': [], From bb0542c0658cdc818038905657c060e91aa16c37 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 17:48:29 -0800 Subject: [PATCH 174/223] apply suggestion from code review --- sky/backends/backend_utils.py | 21 +- sky/backends/cloud_vm_ray_backend.py | 3 +- sky/execution.py | 229 +++++++++++---------- sky/serve/__init__.py | 5 +- sky/serve/autoscalers.py | 18 +- sky/serve/constants.py | 51 +++-- sky/serve/controller.py | 53 +++-- sky/serve/load_balancer.py | 54 ++--- sky/serve/load_balancing_policies.py | 16 +- sky/serve/replica_managers.py | 159 +++++++------- sky/serve/serve_utils.py | 33 +-- sky/serve/service.py | 5 +- sky/serve/service_spec.py | 3 +- sky/skylet/constants.py | 3 + sky/skylet/events.py | 6 +- sky/templates/sky-serve-controller.yaml.j2 | 3 + sky/templates/spot-controller.yaml.j2 | 2 +- sky/utils/cli_utils/status_utils.py | 2 +- sky/utils/subprocess_utils.py | 11 +- tests/test_smoke.py | 24 ++- 20 files changed, 375 insertions(+), 326 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index c316342d73c..37a66bbd9ab 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2891,12 +2891,18 @@ def get_backend_from_handle( def get_task_demands_dict(task: 'task_lib.Task') 
-> Dict[str, float]: - """Returns the resources dict of the task""" - # TODO: CPU and other memory resources are not supported yet - # except for sky serve controller task. + """Returns the resources dict of the task. + + Returns: + A dict of the resources of the task. The keys are the resource names + and the values are the number of the resources. It always contains + the CPU resource (to control the maximum number of tasks), and + optionally accelerator demands. + """ + # TODO: Custom CPU and other memory resources are not supported yet. + # For sky serve controller task, we set the CPU resource to a smaller + # value to support a larger number of services. resources_dict = { - # We set CPU resource for sky serve controller to a smaller value - # to support a larger number of services. 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND if task.service_name is not None else DEFAULT_TASK_CPU_DEMAND) } @@ -2913,6 +2919,11 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: def get_task_resources_str(task: 'task_lib.Task') -> str: + """Returns the resources string of the task. + + The resources string is only used as a display purpose, so we only show + the accelerator demands (if any). Otherwise, the CPU demand is shown. + """ resources_dict = get_task_demands_dict(task) if len(resources_dict) > 1: resources_dict.pop('CPU') diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 14d756cbb23..11a7e8b2e0c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -321,8 +321,7 @@ def add_gang_scheduling_placement_group_and_setup( assert len(resources_dict) == 1, ( 'There can only be one type of accelerator per instance. 
' f'Found: {resources_dict}.') - acc_name = list(resources_dict.keys())[0] - acc_count = list(resources_dict.values())[0] + acc_name, acc_count = list(resources_dict.items())[0] gpu_dict = {'GPU': acc_count} # gpu_dict should be empty when the accelerator is not GPU. # FIXME: This is a hack to make sure that we do not reserve diff --git a/sky/execution.py b/sky/execution.py index b3d0a60165c..7438863ac80 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -8,7 +8,7 @@ import os import re import tempfile -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import uuid import colorama @@ -614,6 +614,96 @@ def _shared_controller_env_vars() -> Dict[str, Any]: return env_vars +def _controller_skypilot_config_setup( + controller_type: str, + controller_resources_config: Dict[str, Any], + remote_user_config_path: str, +) -> Tuple[Dict[str, Any], 'sky.Resources']: + """Read the skypilot config and setup the controller resources. + + Returns: + A tuple of (vars_to_fill, controller_resources). + """ + vars_to_fill: Dict[str, Any] = {} + controller_envs = _shared_controller_env_vars() + controller_resources_config_copied: Dict[str, Any] = copy.copy( + controller_resources_config) + if skypilot_config.loaded(): + # Look up the contents of the already loaded configs via the + # 'skypilot_config' module. Don't simply read the on-disk file as + # it may have changed since this process started. + # + # Set any proxy command to None, because the controller would've + # been launched behind the proxy, and in general any nodes we + # launch may not have or need the proxy setup. (If the controller + # needs to launch mew clusters in another region/VPC, the user + # should properly set up VPC peering, which will allow the + # cross-region/VPC communication. The proxy command is orthogonal + # to this scenario.) 
+ # + # This file will be uploaded to the controller node and will be + # used throughout the spot job's / service's recovery attempts + # (i.e., if it relaunches due to preemption, we make sure the + # same config is used). + # + # NOTE: suppose that we have a controller in old VPC, then user + # changes 'vpc_name' in the config and does a 'spot launch' / + # 'serve up'. In general, the old controller may not successfully + # launch the job in the new VPC. This happens if the two VPCs don’t + # have peering set up. Like other places in the code, we assume + # properly setting up networking is user's responsibilities. + # TODO(zongheng): consider adding a basic check that checks + # controller VPC (or name) == the spot job's / service's VPC + # (or name). It may not be a sufficient check (as it's always + # possible that peering is not set up), but it may catch some + # obvious errors. + # TODO(zhwu): hacky. We should only set the proxy command of the + # cloud where the controller is launched (currently, only aws user + # uses proxy_command). + proxy_command_key = ('aws', 'ssh_proxy_command') + ssh_proxy_command = skypilot_config.get_nested(proxy_command_key, None) + config_dict = skypilot_config.to_dict() + if isinstance(ssh_proxy_command, str): + config_dict = skypilot_config.set_nested(proxy_command_key, None) + elif isinstance(ssh_proxy_command, dict): + # Instead of removing the key, we set the value to empty string + # so that the controller will only try the regions specified by + # the keys. 
+ ssh_proxy_command = {k: None for k in ssh_proxy_command} + config_dict = skypilot_config.set_nested(proxy_command_key, + ssh_proxy_command) + + with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmpfile: + common_utils.dump_yaml(tmpfile.name, config_dict) + controller_envs[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( + remote_user_config_path) + vars_to_fill.update({ + 'user_config_path': tmpfile.name, + 'remote_user_config_path': remote_user_config_path, + }) + + # Override the controller resources with the ones specified in the + # config. + custom_controller_resources_config = skypilot_config.get_nested( + (controller_type, 'controller', 'resources'), None) + if custom_controller_resources_config is not None: + controller_resources_config_copied.update( + custom_controller_resources_config) + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config_copied) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( + controller_type=controller_type, + err=common_utils.format_exception(e, + use_bracket=True))) from e + + vars_to_fill['envs'] = controller_envs + return vars_to_fill, controller_resources + + @usage_lib.entrypoint def spot_launch( task: Union['sky.Task', 'sky.Dag'], @@ -665,6 +755,10 @@ def spot_launch( mode='w') as f: dag_utils.dump_chain_dag_to_yaml(dag, f.name) controller_name = spot.SPOT_CONTROLLER_NAME + extra_vars, controller_resources = _controller_skypilot_config_setup( + controller_type='spot', + controller_resources_config=spot.constants.CONTROLLER_RESOURCES, + remote_user_config_path=f'{dag.name}-{dag_uuid}.config_yaml') vars_to_fill = { 'remote_user_yaml_prefix': spot.SPOT_TASK_YAML_PREFIX, 'user_yaml_path': f.name, @@ -676,86 +770,8 @@ def spot_launch( 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, 'retry_until_up': retry_until_up, + **extra_vars, } - controller_resources_config = 
copy.copy( - spot.constants.CONTROLLER_RESOURCES) - spot_env_vars = _shared_controller_env_vars() - if skypilot_config.loaded(): - # Look up the contents of the already loaded configs via the - # 'skypilot_config' module. Don't simply read the on-disk file as - # it may have changed since this process started. - # - # Set any proxy command to None, because the controller would've - # been launched behind the proxy, and in general any nodes we - # launch may not have or need the proxy setup. (If the controller - # needs to launch spot clusters in another region/VPC, the user - # should properly set up VPC peering, which will allow the - # cross-region/VPC communication. The proxy command is orthogonal - # to this scenario.) - # - # This file will be uploaded to the controller node and will be - # used throughout the spot job's recovery attempts (i.e., if it - # relaunches due to preemption, we make sure the same config is - # used). - # - # NOTE: suppose that we have a controller in old VPC, then user - # changes 'vpc_name' in the config and does a 'spot launch'. In - # general, the old controller may not successfully launch the job - # in the new VPC. This happens if the two VPCs don’t have peering - # set up. Like other places in the code, we assume properly setting - # up networking is user's responsibilities. - # TODO(zongheng): consider adding a basic check that checks - # controller VPC (or name) == the spot job's VPC (or name). It may - # not be a sufficient check (as it's always possible that peering - # is not set up), but it may catch some obvious errors. - # TODO(zhwu): hacky. We should only set the proxy command of the - # cloud where the controller is launched (currently, only aws user - # uses proxy_command). 
- proxy_command_key = ('aws', 'ssh_proxy_command') - ssh_proxy_command = skypilot_config.get_nested( - proxy_command_key, None) - config_dict = skypilot_config.to_dict() - if isinstance(ssh_proxy_command, str): - config_dict = skypilot_config.set_nested( - proxy_command_key, None) - elif isinstance(ssh_proxy_command, dict): - # Instead of removing the key, we set the value to empty string - # so that the controller will only try the regions specified by - # the keys. - ssh_proxy_command = {k: None for k in ssh_proxy_command} - config_dict = skypilot_config.set_nested( - proxy_command_key, ssh_proxy_command) - - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmpfile: - prefix = spot.SPOT_TASK_YAML_PREFIX - remote_user_config_path = ( - f'{prefix}/{dag.name}-{dag_uuid}.config_yaml') - common_utils.dump_yaml(tmpfile.name, config_dict) - spot_env_vars[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( - remote_user_config_path) - vars_to_fill.update({ - 'user_config_path': tmpfile.name, - 'remote_user_config_path': remote_user_config_path, - 'envs': spot_env_vars, - }) - - # Override the controller resources with the ones specified in the - # config. 
- custom_controller_resources_config = skypilot_config.get_nested( - ('spot', 'controller', 'resources'), None) - if custom_controller_resources_config is not None: - controller_resources_config.update( - custom_controller_resources_config) - try: - controller_resources = sky.Resources.from_yaml_config( - controller_resources_config) - except ValueError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( - controller_type='spot', - err=common_utils.format_exception( - e, use_bracket=True))) from e yaml_path = os.path.join(spot.SPOT_CONTROLLER_YAML_PREFIX, f'{name}-{dag_uuid}.yaml') @@ -986,26 +1002,7 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: if task.service is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError('Service section not found.') - controller_resources_config: Dict[str, Any] = copy.copy( - serve.CONTROLLER_RESOURCES) - # Override the controller resources with the ones specified in the - # config. 
- custom_controller_resources_config = skypilot_config.get_nested( - ('serve', 'controller', 'resources'), None) - if custom_controller_resources_config is not None: - controller_resources_config.update(custom_controller_resources_config) - try: - controller_resources = sky.Resources.from_yaml_config( - controller_resources_config) - except ValueError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( - controller_type='serve', - err=common_utils.format_exception(e, use_bracket=True), - )) from e - assert task.service is not None, task assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] if requested_resources.ports is not None: @@ -1042,19 +1039,6 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: 'For now, `sky serve down` and then `sky serve up` ' 'again.') - controller_name = serve.SKY_SERVE_CONTROLLER_NAME - # TODO(tian): Probably run another sky.launch after we get the load balancer - # port from the controller? So we don't need to open so many ports here. Or, - # we should have a nginx traffic control to refuse any connection to the - # unregistered ports. - # Choose the same cloud if controller is not launched, controller resources - # not specify cloud and replica cloud is specified. 
- controller_cloud = (requested_resources.cloud if status is None and - controller_resources.cloud is None and - requested_resources.cloud is not None else None) - controller_resources = controller_resources.copy( - cloud=controller_cloud, ports=[serve.LOAD_BALANCER_PORT_RANGE]) - _maybe_translate_local_file_mounts_and_sync_up(task, path='serve') with tempfile.NamedTemporaryFile( @@ -1064,12 +1048,19 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: prefix=f'controller-task-{service_name}-', mode='w', ) as controller_file: + controller_name = serve.SKY_SERVE_CONTROLLER_NAME task_config = task.to_yaml_config() common_utils.dump_yaml(service_file.name, task_config) remote_task_yaml_path = ( serve.generate_remote_task_yaml_file_name(service_name)) + remote_config_yaml_path = ( + serve.generate_remote_config_yaml_file_name(service_name)) controller_log_file = ( serve.generate_remote_controller_log_file_name(service_name)) + extra_vars, controller_resources = _controller_skypilot_config_setup( + controller_type='serve', + controller_resources_config=serve.CONTROLLER_RESOURCES, + remote_user_config_path=remote_config_yaml_path) vars_to_fill = { 'remote_task_yaml_path': remote_task_yaml_path, 'local_task_yaml_path': service_file.name, @@ -1077,13 +1068,23 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: gcp.GOOGLE_SDK_INSTALLATION_COMMAND, 'service_name': service_name, 'controller_log_file': controller_log_file, - 'envs': _shared_controller_env_vars(), + **extra_vars, } backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, vars_to_fill, output_path=controller_file.name) - # TODO(tian): Probably we should support customizable setup commands. controller_task = task_lib.Task.from_yaml(controller_file.name) + # Choose the same cloud if controller is not launched, controller + # resources not specify cloud and replica cloud is specified. 
+ controller_cloud = (requested_resources.cloud if status is None and + controller_resources.cloud is None and + requested_resources.cloud is not None else None) + # TODO(tian): Probably run another sky.launch after we get the load + # balancer port from the controller? So we don't need to open so many + # ports here. Or, we should have a nginx traffic control to refuse + # any connection to the unregistered ports. + controller_resources = controller_resources.copy( + cloud=controller_cloud, ports=[serve.LOAD_BALANCER_PORT_RANGE]) controller_task.set_resources(controller_resources) # Set this to modify default ray task CPU usage to custom value diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 7955900ccaf..b1705919582 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -3,13 +3,14 @@ from sky.serve.constants import CONTROLLER_RESOURCES from sky.serve.constants import CONTROLLER_TEMPLATE -from sky.serve.constants import ENDPOINT_PROBE_INTERVAL -from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL +from sky.serve.constants import ENDPOINT_PROBE_INTERVAL_SECONDS +from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL_SECONDS from sky.serve.constants import LOAD_BALANCER_PORT_RANGE from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.constants import SKYSERVE_METADATA_DIR from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus +from sky.serve.serve_utils import generate_remote_config_yaml_file_name from sky.serve.serve_utils import generate_remote_controller_log_file_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 8f38cd04e53..1ea58583841 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -53,14 +53,14 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', self.min_replicas: int = 
spec.min_replicas self.max_replicas: int = spec.max_replicas or spec.min_replicas self.frequency = frequency - if self.frequency < constants.LB_CONTROLLER_SYNC_INTERVAL: + if self.frequency < constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS: logger.warning('Autoscaler frequency is less than ' 'controller sync interval. It might ' 'not always got the latest information.') - def update_request_information( - self, request_information: serve_utils.RequestInformation) -> None: - """Update request information for autoscaling.""" + def collect_request_information( + self, request_aggregator: serve_utils.RequestsAggregator) -> None: + """Collect request information from aggregator for autoscaling.""" raise NotImplementedError def evaluate_scaling(self, infos: List[Dict[str, @@ -97,13 +97,13 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', frequency: int, self.last_scale_operation: float = 0. self.request_timestamps: List[float] = [] - def update_request_information( - self, request_information: serve_utils.RequestInformation) -> None: - if not isinstance(request_information, serve_utils.RequestTimestamp): - raise ValueError('Request information must be of type ' + def collect_request_information( + self, request_aggregator: serve_utils.RequestsAggregator) -> None: + if not isinstance(request_aggregator, serve_utils.RequestTimestamp): + raise ValueError('Request aggregator must be of type ' 'serve_utils.RequestTimestamp for ' 'RequestRateAutoscaler.') - self.request_timestamps.extend(request_information.get()) + self.request_timestamps.extend(request_aggregator.get()) current_time = time.time() index = bisect.bisect_left(self.request_timestamps, current_time - self.rps_window_size) diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 3e1d25be52b..ecb3ff852a5 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -4,39 +4,37 @@ SKYSERVE_METADATA_DIR = '~/.sky/serve' -# The filelock for selecting service ports when starting a service. 
We need to -# have a filelock to avoid port collision when starting multiple services at -# the same time. +# The filelock for selecting service ports on controller VM when starting a +# service. We need to have a filelock to avoid port collision when starting +# multiple services at the same time. PORT_SELECTION_FILE_LOCK_PATH = f'{SKYSERVE_METADATA_DIR}/port_selection.lock' # Signal file path for controller to handle signals. SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' -# The time interval for load balancer to sync with controller. Every time the -# load balancer syncs with controller, it will update all available replica ips -# for each service, also send the number of requests in last query interval. -LB_CONTROLLER_SYNC_INTERVAL = 20 +# The time interval in seconds for load balancer to sync with controller. Every +# time the load balancer syncs with controller, it will update all available +# replica ips for each service, also send the number of requests in last query +# interval. +LB_CONTROLLER_SYNC_INTERVAL_SECONDS = 20 -# Interval to probe replica endpoint. -ENDPOINT_PROBE_INTERVAL = 10 +# Interval in seconds to probe replica endpoint. +ENDPOINT_PROBE_INTERVAL_SECONDS = 10 -# The default timeout for a readiness probe request. We set the timeout to 15s -# since using actual generation in LLM services as readiness probe is very -# time-consuming (33B, 70B, ...). +# The default timeout in seconds for a readiness probe request. We set the +# timeout to 15s since using actual generation in LLM services as readiness +# probe is very time-consuming (33B, 70B, ...). # TODO(tian): Expose this option to users in yaml file. -READINESS_PROBE_TIMEOUT = 15 +READINESS_PROBE_TIMEOUT_SECONDS = 15 -# Wait for 1 minutes for controller / load balancer to terminate. -SERVE_TERMINATE_WAIT_TIMEOUT = 60 - -# Autoscaler window size for request per second. We calculate rps by divide the -# number of requests in last window size by this window size. 
-AUTOSCALER_RPS_WINDOW_SIZE = 60 -# Autoscaler scale frequency. We will try to scale up/down every +# Autoscaler window size in seconds for request per second. We calculate rps by +# divide the number of requests in last window size by this window size. +AUTOSCALER_RPS_WINDOW_SIZE_SECONDS = 60 +# Autoscaler scale frequency in seconds. We will try to scale up/down every # `scale_frequency`. -AUTOSCALER_SCALE_FREQUENCY = 20 -# Autoscaler cooldown time. We will not scale up/down if the last scale up/down -# is within this cooldown time. +AUTOSCALER_SCALE_FREQUENCY_SECONDS = 20 +# Autoscaler cooldown time in seconds. We will not scale up/down if the last +# scale up/down is within this cooldown time. AUTOSCALER_COOLDOWN_SECONDS = 60 # The default controller resources. @@ -44,16 +42,17 @@ # size is 150 GB. Also, we need 32 GB memory to run our controller and load # balancer jobs since it is very memory demanding. # TODO(tian): We might need to be careful that service logs can take a lot of -# disk space. Maybe we could use a larger disk size or migrate to cloud storage. +# disk space. Maybe we could use a larger disk size, migrate to cloud storage or +# do some log rotation. CONTROLLER_RESOURCES = {'disk_size': 200, 'memory': '32+'} # Our ray jobs is very memory demanding and number of services on a single # controller is limited by memory. Rough benchmark result shows each service # needs ~0.6 GB to run only for controller and load balancer process. # Considering there will be some sky launch and sky down process on the fly, we -# set the memory usage to 2 GB to be safe. +# set the memory usage to 1 GB to be safe. # In this setup, a default highmem controller with 4 vCPU and 32 GB memory can -# run 16 services. +# run 32 services. # TODO(tian): Since now we only have one job, we set this to 1 GB. Should do # some benchmark to make sure this is safe. 
SERVICES_MEMORY_USAGE_GB = 1.0 diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 22d7fd9650f..d039524f162 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -39,69 +39,62 @@ class SkyServeController: def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, port: int) -> None: - self.service_name = service_name - self.replica_manager: replica_managers.ReplicaManager = ( + self._service_name = service_name + self._replica_manager: replica_managers.ReplicaManager = ( replica_managers.SkyPilotReplicaManager(service_name=service_name, spec=service_spec, task_yaml_path=task_yaml)) - self.autoscaler: autoscalers.Autoscaler = ( + self._autoscaler: autoscalers.Autoscaler = ( autoscalers.RequestRateAutoscaler( service_spec, - frequency=constants.AUTOSCALER_SCALE_FREQUENCY, + frequency=constants.AUTOSCALER_SCALE_FREQUENCY_SECONDS, cooldown=constants.AUTOSCALER_COOLDOWN_SECONDS, - rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE)) - self.port = port - self.app = fastapi.FastAPI() + rps_window_size=constants.AUTOSCALER_RPS_WINDOW_SIZE_SECONDS)) + self._port = port + self._app = fastapi.FastAPI() def _run_autoscaler(self): - logger.info('Starting autoscaler monitor.') + logger.info('Starting autoscaler.') while True: try: replica_info = serve_utils.get_replica_info( - self.service_name, + self._service_name, with_handle=env_options.Options.SHOW_DEBUG_INFO.get()) logger.info(f'All replica info: {replica_info}') - scaling_option = self.autoscaler.evaluate_scaling(replica_info) + scaling_option = self._autoscaler.evaluate_scaling(replica_info) if (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_UP): assert isinstance(scaling_option.target, int), scaling_option - self.replica_manager.scale_up(scaling_option.target) + self._replica_manager.scale_up(scaling_option.target) elif (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_DOWN): assert 
isinstance(scaling_option.target, list), scaling_option - self.replica_manager.scale_down(scaling_option.target) + self._replica_manager.scale_down(scaling_option.target) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. logger.error(f'Error in autoscaler: {e}') - time.sleep(self.autoscaler.frequency) + time.sleep(self._autoscaler.frequency) def run(self) -> None: - @self.app.post('/controller/load_balancer_sync') + @self._app.post('/controller/load_balancer_sync') async def load_balancer_sync(request: fastapi.Request): request_data = await request.json() - request_information_payload = request_data.get( - 'request_information') - request_information = pickle.loads( - base64.b64decode(request_information_payload)) + request_aggregator_payload = request_data.get('request_aggregator') + request_aggregator = pickle.loads( + base64.b64decode(request_aggregator_payload)) logger.info( - f'Received inflight request information: {request_information}') - if isinstance(self.autoscaler, autoscalers.RequestRateAutoscaler): - if not isinstance(request_information, - serve_utils.RequestTimestamp): - raise ValueError('Request information must be of type ' - 'serve_utils.RequestTimestamp for ' - 'RequestRateAutoscaler.') - self.autoscaler.update_request_information(request_information) + f'Received inflight request information: {request_aggregator}') + self._autoscaler.collect_request_information(request_aggregator) return { 'ready_replica_ips': - self.replica_manager.get_ready_replica_ips() + self._replica_manager.get_ready_replica_ips() } - @self.app.on_event('startup') + @self._app.on_event('startup') def configure_logger(): uvicorn_access_logger = logging.getLogger('uvicorn.access') for handler in uvicorn_access_logger.handlers: @@ -110,9 +103,9 @@ def configure_logger(): threading.Thread(target=self._run_autoscaler).start() logger.info('SkyServe Controller started on ' - 
f'http://localhost:{self.port}') + f'http://localhost:{self._port}') - uvicorn.run(self.app, host='localhost', port=self.port) + uvicorn.run(self._app, host='localhost', port=self._port) def run_controller(service_name: str, service_spec: serve.SkyServiceSpec, diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 96b7fa30b39..a57276ba322 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -23,6 +23,9 @@ class SkyServeLoadBalancer: This class accept any traffic to the controller and redirect it to the appropriate endpoint replica according to the load balancing policy. + + NOTE: HTTP redirect is used. Thus, when using `curl`, be sure to use + `curl -L`. """ def __init__(self, controller_url: str, load_balancer_port: int, @@ -34,22 +37,23 @@ def __init__(self, controller_url: str, load_balancer_port: int, load_balancer_port: The port where the load balancer listens to. replica_port: The port where the replica app listens to. """ - self.app = fastapi.FastAPI() - self.controller_url = controller_url - self.load_balancer_port = load_balancer_port - self.replica_port = replica_port - self.load_balancing_policy: lb_policies.LoadBalancingPolicy = ( + self._app = fastapi.FastAPI() + self._controller_url = controller_url + self._load_balancer_port = load_balancer_port + self._replica_port = replica_port + self._load_balancing_policy: lb_policies.LoadBalancingPolicy = ( lb_policies.RoundRobinPolicy()) - self.request_information: serve_utils.RequestInformation = ( + self._request_aggregator: serve_utils.RequestsAggregator = ( serve_utils.RequestTimestamp()) def _sync_with_controller(self): """Sync with controller periodically. - Every `constants.LB_CONTROLLER_SYNC_INTERVAL` seconds, the load balancer - will sync with the controller to get the latest information about - available replicas; also, it report the request information to the - controller, so that the controller can make autoscaling decisions. 
+ Every `constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS` seconds, the + load balancer will sync with the controller to get the latest + information about available replicas; also, it report the request + information to the controller, so that the controller can make + autoscaling decisions. """ # Sleep for a while to wait the controller bootstrap. time.sleep(5) @@ -59,28 +63,28 @@ def _sync_with_controller(self): try: # Send request information response = session.post( - self.controller_url + '/controller/load_balancer_sync', + self._controller_url + '/controller/load_balancer_sync', json={ - 'request_information': base64.b64encode( - pickle.dumps(self.request_information) + 'request_aggregator': base64.b64encode( + pickle.dumps(self._request_aggregator) ).decode('utf-8') }, timeout=5) # Clean up after reporting request information to avoid OOM. - self.request_information.clear() + self._request_aggregator.clear() response.raise_for_status() ready_replica_ips = response.json().get('ready_replica_ips') except requests.RequestException as e: print(f'An error occurred: {e}') else: logger.info(f'Available Replica IPs: {ready_replica_ips}') - self.load_balancing_policy.set_ready_replicas( + self._load_balancing_policy.set_ready_replicas( ready_replica_ips) - time.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL) + time.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS) async def _redirect_handler(self, request: fastapi.Request): - self.request_information.add(request) - replica_ip = self.load_balancing_policy.select_replica(request) + self._request_aggregator.add(request) + replica_ip = self._load_balancing_policy.select_replica(request) if replica_ip is None: raise fastapi.HTTPException(status_code=503, @@ -88,16 +92,16 @@ async def _redirect_handler(self, request: fastapi.Request): 'Use "sky serve status [SERVICE_ID]" ' 'to check the replica status.') - path = f'http://{replica_ip}:{self.replica_port}{request.url.path}' + path = 
f'http://{replica_ip}:{self._replica_port}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) def run(self): - self.app.add_api_route('/{path:path}', - self._redirect_handler, - methods=['GET', 'POST', 'PUT', 'DELETE']) + self._app.add_api_route('/{path:path}', + self._redirect_handler, + methods=['GET', 'POST', 'PUT', 'DELETE']) - @self.app.on_event('startup') + @self._app.on_event('startup') def configure_logger(): uvicorn_access_logger = logging.getLogger('uvicorn.access') for handler in uvicorn_access_logger.handlers: @@ -106,9 +110,9 @@ def configure_logger(): threading.Thread(target=self._sync_with_controller, daemon=True).start() logger.info('SkyServe Load Balancer started on ' - f'http://0.0.0.0:{self.load_balancer_port}') + f'http://0.0.0.0:{self._load_balancer_port}') - uvicorn.run(self.app, host='0.0.0.0', port=self.load_balancer_port) + uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port) def run_load_balancer(controller_addr: str, load_balancer_port: int, diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index 4aa3e46f5f4..038d33759c2 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,5 +1,5 @@ """LoadBalancingPolicy: Policy to select endpoint.""" -from typing import List, Optional, Set +from typing import List, Optional import fastapi @@ -12,9 +12,9 @@ class LoadBalancingPolicy: """Abstract class for load balancing policies.""" def __init__(self) -> None: - self.ready_replicas: Set[str] = set() + self.ready_replicas: List[str] = [] - def set_ready_replicas(self, ready_replicas: Set[str]) -> None: + def set_ready_replicas(self, ready_replicas: List[str]) -> None: raise NotImplementedError def select_replica(self, request: fastapi.Request) -> Optional[str]: @@ -26,20 +26,18 @@ class RoundRobinPolicy(LoadBalancingPolicy): def __init__(self, *args, **kwargs) -> None: 
super().__init__(*args, **kwargs) - self.replicas_queue: List[str] = [] self.index = 0 - def set_ready_replicas(self, ready_replicas: Set[str]) -> None: + def set_ready_replicas(self, ready_replicas: List[str]) -> None: if set(ready_replicas) != set(self.ready_replicas): self.ready_replicas = ready_replicas - self.replicas_queue = list(ready_replicas) self.index = 0 def select_replica(self, request: fastapi.Request) -> Optional[str]: - if not self.replicas_queue: + if not self.ready_replicas: return None - replica_ip = self.replicas_queue[self.index] - self.index = (self.index + 1) % len(self.replicas_queue) + replica_ip = self.ready_replicas[self.index] + self.index = (self.index + 1) % len(self.ready_replicas) request_repr = (' bool: + """Whether to remove the replica record from the replica table. + + If not, the replica will stay in the replica table permanently to + notify the user that something is wrong with the user code / setup. + """ if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False if self.sky_down_status != ProcessStatus.SUCCEEDED: @@ -210,6 +215,12 @@ def is_scale_down_succeeded(self, initial_delay_seconds: int, return self.first_ready_time is not None def should_track_status(self) -> bool: + """Should we track the status of the replica. + + This includes: + (1) Job status; + (2) Readiness probe. 
+ """ if self.sky_launch_status != ProcessStatus.SUCCEEDED: return False if self.sky_down_status is not None: @@ -221,6 +232,7 @@ def should_track_status(self) -> bool: return True def to_replica_status(self) -> serve_state.ReplicaStatus: + """Convert status property to human-readable replica status.""" if self.sky_launch_status == ProcessStatus.RUNNING: # Still launching return serve_state.ReplicaStatus.PROVISIONING @@ -318,7 +330,7 @@ def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: def probe( self, - readiness_suffix: str, + readiness_route: str, post_data: Optional[Dict[str, Any]], ) -> Tuple['ReplicaInfo', bool, float]: """Probe the readiness of the replica. @@ -331,18 +343,18 @@ def probe( try: msg = '' # TODO(tian): Support HTTPS in the future. - readiness_path = f'http://{self.ip}{readiness_suffix}' + readiness_path = f'http://{self.ip}{readiness_route}' if post_data is not None: msg += 'POST' response = requests.post( readiness_path, json=post_data, - timeout=serve_constants.READINESS_PROBE_TIMEOUT) + timeout=serve_constants.READINESS_PROBE_TIMEOUT_SECONDS) else: msg += 'GET' response = requests.get( readiness_path, - timeout=serve_constants.READINESS_PROBE_TIMEOUT) + timeout=serve_constants.READINESS_PROBE_TIMEOUT_SECONDS) msg += (f' request to {replica_identity} returned status ' f'code {response.status_code}') if response.status_code == 200: @@ -366,18 +378,18 @@ class ReplicaManager: def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec') -> None: self.lock = threading.Lock() - self.next_replica_id: int = 1 - self.service_name: str = service_name - self.auto_restart = spec.auto_restart - self.readiness_suffix: str = spec.readiness_suffix - self.initial_delay_seconds: int = spec.initial_delay_seconds - self.post_data: Optional[Dict[str, Any]] = spec.post_data - self.uptime: Optional[float] = None - logger.info(f'Readiness probe suffix: {self.readiness_suffix}') - logger.info(f'Initial delay seconds: 
{self.initial_delay_seconds}') - logger.info(f'Post data: {self.post_data}') - - def get_ready_replica_ips(self) -> Set[str]: + self._next_replica_id: int = 1 + self._service_name: str = service_name + self._auto_restart = spec.auto_restart + self._readiness_route: str = spec.readiness_route + self._initial_delay_seconds: int = spec.initial_delay_seconds + self._post_data: Optional[Dict[str, Any]] = spec.post_data + self._uptime: Optional[float] = None + logger.info(f'Readiness probe suffix: {self._readiness_route}') + logger.info(f'Initial delay seconds: {self._initial_delay_seconds}') + logger.info(f'Post data: {self._post_data}') + + def get_ready_replica_ips(self) -> List[str]: """Get all ready replica's IP addresses.""" raise NotImplementedError @@ -391,15 +403,24 @@ def scale_down(self, replica_ids: List[int]) -> None: class SkyPilotReplicaManager(ReplicaManager): - """Replica Manager for SkyPilot clusters.""" + """Replica Manager for SkyPilot clusters. + + It will run three daemon to monitor the status of the replicas: + (1) _process_pool_refresher: Refresh the launch/down process pool + to monitor the progress of the launch/down process. + (2) _job_status_fetcher: Fetch the job status of the service to + monitor the status of the service jobs. + (3) _replica_prober: Do readiness probe to the replicas to monitor + whether it is still responding to requests. 
+ """ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', task_yaml_path: str) -> None: super().__init__(service_name, spec) - self.task_yaml_path = task_yaml_path - self.launch_process_pool: serve_utils.ThreadSafeDict[ + self._task_yaml_path = task_yaml_path + self._launch_process_pool: serve_utils.ThreadSafeDict[ int, multiprocessing.Process] = serve_utils.ThreadSafeDict() - self.down_process_pool: serve_utils.ThreadSafeDict[ + self._down_process_pool: serve_utils.ThreadSafeDict[ int, multiprocessing.Process] = serve_utils.ThreadSafeDict() threading.Thread(target=self._process_pool_refresher).start() @@ -410,44 +431,44 @@ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', # Replica management functions # ################################ - def get_ready_replica_ips(self) -> Set[str]: - ready_replicas = set() - infos = serve_state.get_replica_infos(self.service_name) + def get_ready_replica_ips(self) -> List[str]: + ready_replicas = [] + infos = serve_state.get_replica_infos(self._service_name) for info in infos: if info.status == serve_state.ReplicaStatus.READY: assert info.ip is not None - ready_replicas.add(info.ip) + ready_replicas.append(info.ip) return ready_replicas def _launch_replica(self, replica_id: int) -> None: - if replica_id in self.launch_process_pool: + if replica_id in self._launch_process_pool: logger.warning(f'Launch process for replica {replica_id} ' 'already exists. 
Skipping.') return logger.info(f'Launching replica {replica_id}...') cluster_name = serve_utils.generate_replica_cluster_name( - self.service_name, replica_id) + self._service_name, replica_id) log_file_name = serve_utils.generate_replica_launch_log_file_name( - self.service_name, replica_id) + self._service_name, replica_id) p = multiprocessing.Process( target=serve_utils.RedirectOutputTo( launch_cluster, log_file_name, ).run, - args=(self.task_yaml_path, cluster_name), + args=(self._task_yaml_path, cluster_name), ) p.start() - self.launch_process_pool[replica_id] = p + self._launch_process_pool[replica_id] = p info = ReplicaInfo(replica_id, cluster_name) - serve_state.add_or_update_replica(self.service_name, replica_id, info) + serve_state.add_or_update_replica(self._service_name, replica_id, info) def scale_up(self, n: int) -> None: for _ in range(n): - self._launch_replica(self.next_replica_id) - self.next_replica_id += 1 + self._launch_replica(self._next_replica_id) + self._next_replica_id += 1 def _terminate_replica(self, replica_id: int, sync_down_logs: bool) -> None: - if replica_id in self.down_process_pool: + if replica_id in self._down_process_pool: logger.warning(f'Terminate process for replica {replica_id} ' 'already exists. 
Skipping.') return @@ -455,10 +476,10 @@ def _terminate_replica(self, replica_id: int, sync_down_logs: bool) -> None: def _download_and_stream_logs(info: ReplicaInfo): launch_log_file_name = ( serve_utils.generate_replica_launch_log_file_name( - self.service_name, replica_id)) + self._service_name, replica_id)) local_log_file_name = ( serve_utils.generate_replica_local_log_file_name( - self.service_name, replica_id)) + self._service_name, replica_id)) # Write launch log to local log file with open(local_log_file_name, 'w') as local_file, open(launch_log_file_name, @@ -488,7 +509,7 @@ def _download_and_stream_logs(info: ReplicaInfo): local_file.write(job_file.read()) logger.info(f'Terminating replica {replica_id}...') - info = serve_state.get_replica_info_from_id(self.service_name, + info = serve_state.get_replica_info_from_id(self._service_name, replica_id) assert info is not None @@ -498,7 +519,7 @@ def _download_and_stream_logs(info: ReplicaInfo): logger.info(f'preempted: {info.status_property.preempted}, ' f'replica_id: {replica_id}') log_file_name = serve_utils.generate_replica_down_log_file_name( - self.service_name, replica_id) + self._service_name, replica_id) p = multiprocessing.Process( target=serve_utils.RedirectOutputTo( terminate_cluster, @@ -507,9 +528,9 @@ def _download_and_stream_logs(info: ReplicaInfo): args=(info.cluster_name,), ) p.start() - self.down_process_pool[replica_id] = p + self._down_process_pool[replica_id] = p info.status_property.sky_down_status = ProcessStatus.RUNNING - serve_state.add_or_update_replica(self.service_name, replica_id, info) + serve_state.add_or_update_replica(self._service_name, replica_id, info) def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: @@ -518,11 +539,11 @@ def scale_down(self, replica_ids: List[int]) -> None: def _recover_from_preemption(self, replica_id: int) -> None: logger.info(f'Beginning recovery for preempted replica {replica_id}.') # TODO(MaoZiming): Support spot 
recovery policies - info = serve_state.get_replica_info_from_id(self.service_name, + info = serve_state.get_replica_info_from_id(self._service_name, replica_id) assert info is not None info.status_property.preempted = True - serve_state.add_or_update_replica(self.service_name, replica_id, info) + serve_state.add_or_update_replica(self._service_name, replica_id, info) self._terminate_replica(replica_id, sync_down_logs=False) ################################# @@ -537,7 +558,7 @@ def _refresh_process_pool(self) -> None: the fly. If any of them finished, it will update the status of the corresponding replica. """ - for replica_id, p in list(self.launch_process_pool.items()): + for replica_id, p in list(self._launch_process_pool.items()): if not p.is_alive(): # TODO(tian): Try-catch in process, and have an enum return # value to indicate which type of failure happened. @@ -546,9 +567,9 @@ def _refresh_process_pool(self) -> None: # when we enable user choose whether to retry or not. logger.info( f'Launch process for replica {replica_id} finished.') - del self.launch_process_pool[replica_id] + del self._launch_process_pool[replica_id] info = serve_state.get_replica_info_from_id( - self.service_name, replica_id) + self._service_name, replica_id) assert info is not None if p.exitcode != 0: logger.warning( @@ -560,15 +581,15 @@ def _refresh_process_pool(self) -> None: else: info.status_property.sky_launch_status = ( ProcessStatus.SUCCEEDED) - serve_state.add_or_update_replica(self.service_name, replica_id, - info) - for replica_id, p in list(self.down_process_pool.items()): + serve_state.add_or_update_replica(self._service_name, + replica_id, info) + for replica_id, p in list(self._down_process_pool.items()): if not p.is_alive(): logger.info( f'Terminate process for replica {replica_id} finished.') - del self.down_process_pool[replica_id] + del self._down_process_pool[replica_id] info = serve_state.get_replica_info_from_id( - self.service_name, replica_id) + 
self._service_name, replica_id) assert info is not None if p.exitcode != 0: logger.error(f'Down process for replica {replica_id} ' @@ -590,12 +611,12 @@ def _refresh_process_pool(self) -> None: # initial_delay_seconds is not supported. We should add it # later when we support `sky serve update`. if info.status_property.is_scale_down_succeeded( - self.initial_delay_seconds, self.auto_restart): + self._initial_delay_seconds, self._auto_restart): # This means the cluster is deleted due to # a scale down or the cluster is recovering # from preemption. Delete the replica info # so it won't count as a replica. - serve_state.remove_replica(self.service_name, replica_id) + serve_state.remove_replica(self._service_name, replica_id) if info.status_property.preempted: removal_reason = 'for preemption recovery' else: @@ -606,7 +627,7 @@ def _refresh_process_pool(self) -> None: logger.info(f'Termination of replica {replica_id} ' 'finished. Replica info is kept since some ' 'failure detected.') - serve_state.add_or_update_replica(self.service_name, + serve_state.add_or_update_replica(self._service_name, replica_id, info) def _process_pool_refresher(self) -> None: @@ -629,7 +650,7 @@ def _fetch_job_status(self) -> None: to make sure the service is running correctly. If any of the replicas failed, it will terminate the replica. """ - infos = serve_state.get_replica_infos(self.service_name) + infos = serve_state.get_replica_infos(self._service_name) for info in infos: if not info.status_property.should_track_status(): continue @@ -647,7 +668,7 @@ def _fetch_job_status(self) -> None: job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP ]: info.status_property.user_app_failed = True - serve_state.add_or_update_replica(self.service_name, + serve_state.add_or_update_replica(self._service_name, info.replica_id, info) logger.warning( f'Service job for replica {info.replica_id} FAILED. 
' @@ -679,7 +700,7 @@ def _probe_all_replicas(self) -> None: probe_futures = [] replica_to_probe = [] with futures.ThreadPoolExecutor() as executor: - infos = serve_state.get_replica_infos(self.service_name) + infos = serve_state.get_replica_infos(self._service_name) for info in infos: if not info.status_property.should_track_status(): continue @@ -688,8 +709,8 @@ def _probe_all_replicas(self) -> None: probe_futures.append( executor.submit( info.probe, - self.readiness_suffix, - self.post_data, + self._readiness_route, + self._post_data, )) logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') @@ -703,12 +724,12 @@ def _probe_all_replicas(self) -> None: info.status_property.service_ready_now = probe_succeeded should_teardown = False if probe_succeeded: - if self.uptime is None: - self.uptime = probe_time + if self._uptime is None: + self._uptime = probe_time logger.info(f'Replica {info.replica_id} is the first ready ' - f'replica. Setting uptime to {self.uptime}.') - serve_state.set_service_uptime(self.service_name, - int(self.uptime)) + f'replica. Setting uptime to {self._uptime}.') + serve_state.set_service_uptime(self._service_name, + int(self._uptime)) info.consecutive_failure_times.clear() if info.status_property.first_ready_time is None: info.status_property.first_ready_time = probe_time @@ -767,7 +788,7 @@ def _probe_all_replicas(self) -> None: else: current_delay_seconds = (probe_time - info.first_not_ready_time) - if current_delay_seconds > self.initial_delay_seconds: + if current_delay_seconds > self._initial_delay_seconds: logger.info( f'Replica {info.replica_id} is not ready and ' 'exceeding initial delay seconds. Terminating ' @@ -778,8 +799,8 @@ def _probe_all_replicas(self) -> None: logger.info( f'Replica {info.replica_id} is not ready but within' f' initial delay seconds ({current_delay_seconds}s ' - f'/ {self.initial_delay_seconds}s). 
Skipping.') - serve_state.add_or_update_replica(self.service_name, + f'/ {self._initial_delay_seconds}s). Skipping.') + serve_state.add_or_update_replica(self._service_name, info.replica_id, info) if should_teardown: self._terminate_replica(info.replica_id, sync_down_logs=True) @@ -792,13 +813,13 @@ def _replica_prober(self) -> None: self._probe_all_replicas() replica_statuses = [ info['status'] - for info in serve_utils.get_replica_info(self.service_name, + for info in serve_utils.get_replica_info(self._service_name, with_handle=False) ] serve_utils.set_service_status_from_replica_statuses( - self.service_name, replica_statuses) + self._service_name, replica_statuses) except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # replica prober running. logger.error(f'Error in replica prober: {e}') - time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL) + time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL_SECONDS) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 9a3ca84b378..42a6a0a6476 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -112,42 +112,45 @@ def values(self): return self._dict.values() -class RequestInformation: - """Base class for request information.""" +class RequestsAggregator: + """Base class for request aggregator.""" def add(self, request: 'fastapi.Request') -> None: - """Add a request to the request information.""" + """Add a request to the request aggregator.""" raise NotImplementedError def get(self) -> List[Any]: - """Get all current request information.""" + """Get all current request aggregator.""" raise NotImplementedError def clear(self) -> None: - """Clear all current request information.""" + """Clear all current request aggregator.""" raise NotImplementedError def __repr__(self) -> str: raise NotImplementedError -class RequestTimestamp(RequestInformation): - """RequestTimestamp: Request information that stores request timestamps.""" +class 
RequestTimestamp(RequestsAggregator): + """RequestTimestamp: Aggregates request timestamps. + + This is useful for QPS-based autoscaling. + """ def __init__(self) -> None: self.timestamps: List[float] = [] def add(self, request: 'fastapi.Request') -> None: - """Add a request to the request information.""" + """Add a request to the request aggregator.""" del request # unused self.timestamps.append(time.time()) def get(self) -> List[float]: - """Get all current request information.""" + """Get all current request aggregator.""" return self.timestamps def clear(self) -> None: - """Clear all current request information.""" + """Clear all current request aggregator.""" self.timestamps = [] def __repr__(self) -> str: @@ -198,6 +201,12 @@ def generate_remote_task_yaml_file_name(service_name: str) -> str: return os.path.join(dir_name, 'task.yaml') +def generate_remote_config_yaml_file_name(service_name: str) -> str: + dir_name = generate_remote_service_dir_name(service_name) + # Don't expand here since it is used for remote machine. + return os.path.join(dir_name, 'config.yaml') + + def generate_remote_controller_log_file_name(service_name: str) -> str: dir_name = generate_remote_service_dir_name(service_name) # Don't expand here since it is used for remote machine. @@ -299,14 +308,14 @@ def get_replica_info(service_name: str, def get_serve_status(service_name: str, with_replica_info: bool = True) -> Dict[str, Any]: - """Get the latest information of the service. + """Get the status dict of the service. Args: service_name: The name of the service. with_replica_info: Whether to include the information of all replicas. Returns: - A dictionary of latest information of the service. + A dictionary, describing the status of the service. 
""" record = serve_state.get_service_from_name(service_name) if record is None: diff --git a/sky/serve/service.py b/sky/serve/service.py index 9e0ef0cc81a..4e37bd25da6 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -1,4 +1,7 @@ -"""Service: Control both the controller and load balancer.""" +"""Main entrypoint to start a service. + +This including the controller and load balancer. +""" import argparse import multiprocessing import os diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 69c21ec1c11..1b3a9e93aa7 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -190,7 +190,7 @@ def __repr__(self) -> str: """) @property - def readiness_suffix(self) -> str: + def readiness_route(self) -> str: return f':{self._replica_port}{self._readiness_path}' @property @@ -211,6 +211,7 @@ def min_replicas(self) -> int: @property def max_replicas(self) -> Optional[int]: + # If None, treated as having the same value of min_replicas. return self._max_replicas @property diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index b2a2583d637..f94034ca986 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -96,6 +96,9 @@ # and hyphens. We use this regex to validate the cluster name. CLUSTER_NAME_VALID_REGEX = '[a-z]([-a-z0-9]*[a-z0-9])?' +# Used for translate local file mounts to cloud storage. Please refer to +# sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for +# more details. 
WORKDIR_BUCKET_NAME = 'skypilot-workdir-{username}-{id}' FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}' FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' diff --git a/sky/skylet/events.py b/sky/skylet/events.py index 2d0ef87d551..ade43104048 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -74,7 +74,11 @@ def _run(self): class ServiceUpdateEvent(SkyletEvent): - """Skylet event for updating sky serve service status.""" + """Skylet event for updating sky serve service status. + + This is needed to handle the case that controller process is somehow + terminated and the service status is not updated. + """ EVENT_INTERVAL_SECONDS = 300 def _run(self): diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index f5562a8bd49..f7e7c6ae2f7 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -14,6 +14,9 @@ setup: | file_mounts: {{remote_task_yaml_path}}: {{local_task_yaml_path}} +{% if user_config_path is not none %} + {{remote_user_config_path}}: {{user_config_path}} +{% endif %} run: | # Start sky serve service. 
diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 5f28ebc29b4..26f9e22f65d 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -5,7 +5,7 @@ name: {{dag_name}} file_mounts: {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml: {{user_yaml_path}} {% if user_config_path is not none %} - {{remote_user_config_path}}: {{user_config_path}} + {{remote_user_yaml_prefix}}/{{remote_user_config_path}}: {{user_config_path}} {% endif %} setup: | diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index ed64dd1d468..ff8ce9ee69c 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -21,7 +21,7 @@ _ClusterRecord = Dict[str, Any] # A record returned by core.cost_report(); see its docstr for all fields. _ClusterCostReportRecord = Dict[str, Any] -# A record in global_user_state's 'services' table. +# A record in serve_state's 'services' table. _ServiceRecord = Dict[str, Any] _ReplicaRecord = Dict[str, Any] diff --git a/sky/utils/subprocess_utils.py b/sky/utils/subprocess_utils.py index decba03c5ef..a77a840b89a 100644 --- a/sky/utils/subprocess_utils.py +++ b/sky/utils/subprocess_utils.py @@ -99,7 +99,7 @@ def kill_children_processes( This is for guaranteeing the order of cleaning up and suppress flaky errors. 
""" - first_to_kill_pid_to_processes = dict() + pid2proc = dict() child_processes = [] if isinstance(first_pid_to_kill, int): first_pid_to_kill = [first_pid_to_kill] @@ -120,15 +120,12 @@ def _kill_processes(processes: List[psutil.Process]) -> None: parent_process = psutil.Process() for child in parent_process.children(recursive=True): if child.pid in first_pid_to_kill: - first_to_kill_pid_to_processes[child.pid] = child + pid2proc[child.pid] = child else: child_processes.append(child) - _kill_processes([ - first_to_kill_pid_to_processes[proc] - for proc in first_pid_to_kill - if proc in first_to_kill_pid_to_processes - ]) + _kill_processes( + [pid2proc[proc] for proc in first_pid_to_kill if proc in pid2proc]) _kill_processes(child_processes) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8c8fbb3acaa..411860c5e19 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2631,16 +2631,16 @@ def _get_service_name() -> str: # `REPLICAS` is in the form of `1/2` where the first number is the number of # ready replicas and the second number is the number of total replicas. We # grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for serve.LB_CONTROLLER_SYNC_INTERVAL to -# make sure load balancer have enough time to sync with the controller and get -# all ready replica IPs. +# failure detected. In the end we sleep for +# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have +# enough time to sync with the controller and get all ready replica IPs. 
_SERVE_WAIT_UNTIL_READY = ( '(while true; do' ' output=$(sky serve status {name});' ' echo "$output" | grep -q "{replica_num}/{replica_num}" && break;' ' echo "$output" | grep -q "FAILED" && exit 1;' ' sleep 10;' - f' done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL};') + f' done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};') _IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' _ENDPOINT_REGEX = _IP_REGEX + r':[0-9]{1,5}' _AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' @@ -2806,12 +2806,14 @@ def terminate_replica(replica_id: int) -> str: return (f'gcloud compute instances delete --zone={zone}' f' --quiet $({query_cmd})') - # In the worst case, the controller will first wait ENDPOINT_PROBE_INTERVAL - # for next probe, and wait LB_CONTROLLER_SYNC_INTERVAL for load balancer's - # next sync with controller. We add 5s more for any overhead, such as - # database read/write. - time_to_wait_after_terminate = (serve.ENDPOINT_PROBE_INTERVAL + - serve.LB_CONTROLLER_SYNC_INTERVAL + 5) + # In the worst case, the controller will first wait + # ENDPOINT_PROBE_INTERVAL_SECONDS for next probe, and wait + # LB_CONTROLLER_SYNC_INTERVAL_SECONDS for load balancer's + # next sync with controller. We add 5s more for any overhead, + # such as database read/write. 
+ time_to_wait_after_terminate = (serve.ENDPOINT_PROBE_INTERVAL_SECONDS + + serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS + + 5) test = Test( f'test-skyserve-replica-failure', @@ -2877,7 +2879,7 @@ def terminate_replica(replica_id: int) -> str: f' output=$(sky serve status {name});' ' echo "$output" | grep -q "1/1" && break;' ' sleep 10;' - f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL};', + f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], f'sky serve down -y {name}', From 330b89316bad30637df46e54cc6a351cab175c3e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 8 Nov 2023 18:08:31 -0800 Subject: [PATCH 175/223] format --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 411860c5e19..f523d4887a2 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2631,7 +2631,7 @@ def _get_service_name() -> str: # `REPLICAS` is in the form of `1/2` where the first number is the number of # ready replicas and the second number is the number of total replicas. We # grep such format to ensure that the service is ready, and early exit if any -# failure detected. In the end we sleep for +# failure detected. In the end we sleep for # serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have # enough time to sync with the controller and get all ready replica IPs. 
_SERVE_WAIT_UNTIL_READY = ( From 2aa266be43bd4740bf187428e2746c27f1a0bf07 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 10:57:07 -0800 Subject: [PATCH 176/223] move controller related functions/classes to controller_utils --- sky/backends/backend_utils.py | 245 ------------------------ sky/backends/cloud_vm_ray_backend.py | 10 +- sky/backends/onprem_utils.py | 4 +- sky/cli.py | 57 +++--- sky/core.py | 47 ++--- sky/data/storage_utils.py | 47 +---- sky/execution.py | 23 ++- sky/serve/__init__.py | 1 + sky/serve/replica_managers.py | 11 +- sky/serve/serve_utils.py | 7 +- sky/spot/controller.py | 10 +- sky/spot/spot_utils.py | 3 +- sky/utils/cli_utils/status_utils.py | 55 +++++- sky/utils/controller_utils.py | 268 +++++++++++++++++++++++++++ 14 files changed, 412 insertions(+), 376 deletions(-) create mode 100644 sky/utils/controller_utils.py diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 37a66bbd9ab..8e4cffd4fb9 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,4 @@ """Util constants/functions for the backends.""" -import dataclasses from datetime import datetime import enum import getpass @@ -38,7 +37,6 @@ from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config -from sky import spot as spot_lib from sky import status_lib from sky.backends import onprem_utils from sky.provision import instance_setup @@ -99,96 +97,6 @@ # Note: This value cannot be too small, otherwise OOM issue may occur. DEFAULT_TASK_CPU_DEMAND = 0.5 -# The default idle timeout for skypilot controllers. This include spot -# controller and sky serve controller. 
-CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 - - -@dataclasses.dataclass -class _ControllerSpec: - """Spec for skypilot controllers.""" - name: str - cluster_name: str - sky_status_hint: str - decline_cancel_hint: str - decline_down_in_init_status_hint: str - decline_down_for_dirty_controller_hint: str - check_cluster_name_hint: str - default_hint_if_non_existent: str - - -class Controllers(enum.Enum): - """Skypilot controllers.""" - # NOTE(dev): Keep this align with - # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE - SPOT_CONTROLLER = _ControllerSpec( - name='managed spot controller', - cluster_name=spot_lib.SPOT_CONTROLLER_NAME, - sky_status_hint=( - f'* To see detailed spot job status: {colorama.Style.BRIGHT}' - f'sky spot queue{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' - f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), - decline_down_in_init_status_hint=( - f'{colorama.Fore.RED}Tearing down the spot controller while ' - 'it is in INIT state is not supported (this means a spot launch ' - 'is in progress or the previous launch failed), as we cannot ' - 'guarantee that all the spot jobs are finished. Please wait ' - 'until the spot controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' - f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' - f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), - check_cluster_name_hint=( - f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' - 'managed spot controller. 
'), - default_hint_if_non_existent='No managed spot jobs are found.') - SKY_SERVE_CONTROLLER = _ControllerSpec( - name='sky serve controller', - cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, - sky_status_hint=( - f'* To see detailed service status: {colorama.Style.BRIGHT}' - f'sky serve status{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the sky serve controller\'s jobs is not allowed.'), - decline_down_in_init_status_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller ' - 'while it is in INIT state is not supported (this means a sky ' - 'serve up is in progress or the previous launch failed), as we ' - 'cannot guarantee that all the services are terminated. Please ' - 'wait until the sky serve controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' - f'{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' - 'supported, as it is currently serving the following services: ' - '{service_names}. Please terminate the services first with ' - f'{colorama.Style.BRIGHT}sky serve down -a' - f'{colorama.Style.RESET_ALL}.'), - check_cluster_name_hint=( - f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' - 'sky serve controller. '), - default_hint_if_non_existent='No service is found.') - - @classmethod - def check_cluster_name(cls, name: Optional[str]) -> Optional['Controllers']: - """Check if the cluster name is a controller name. - - Returns: - The controller if the cluster name is a controller name. - Otherwise, returns None. - """ - for controller in cls: - if controller.value.cluster_name == name: - return controller - return None - - # Filelocks for the cluster status change. 
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20 @@ -1560,10 +1468,6 @@ def generate_cluster_name(): return f'sky-{uuid.uuid4().hex[:4]}-{get_cleaned_username()}' -def generate_service_name(): - return f'sky-service-{uuid.uuid4().hex[:4]}' - - def get_cleaned_username(username: str = '') -> str: """Cleans the username to be used as part of a cluster name. @@ -2585,77 +2489,6 @@ def check_cluster_available( return handle -def is_controller_up( - controller_type: Controllers, - stopped_message: str, - non_existent_message: Optional[str] = None, -) -> Tuple[Optional[status_lib.ClusterStatus], - Optional['backends.CloudVmRayResourceHandle']]: - """Check if the spot/serve controller is up. - - It can be used to check the actual controller status (since the autostop is - set for the controller) before the spot/serve commands interact with the - controller. - - Args: - type: Type of the controller. - stopped_message: Message to print if the controller is STOPPED. - non_existent_message: Message to show if the controller does not exist. - - Returns: - controller_status: The status of the controller. If it fails during - refreshing the status, it will be the cached status. None if the - controller does not exist. - handle: The ResourceHandle of the controller. None if the - controller is not UP or does not exist. - - Raises: - exceptions.ClusterOwnerIdentityMismatchError: if the current user is not - the same as the user who created the cluster. - exceptions.CloudUserIdentityError: if we fail to get the current user - identity. 
- """ - if non_existent_message is None: - non_existent_message = controller_type.value.default_hint_if_non_existent - cluster_name = controller_type.value.cluster_name - controller_name = controller_type.value.name.replace(' controller', '') - try: - # Set force_refresh_statuses=None to make sure the refresh only happens - # when the controller is INIT/UP (triggered in these statuses as the - # autostop is always set for the controller). This optimization avoids - # unnecessary costly refresh when the controller is already stopped. - # This optimization is based on the assumption that the user will not - # start the controller manually from the cloud console. - controller_status, handle = refresh_cluster_status_handle( - cluster_name, force_refresh_statuses=None) - except exceptions.ClusterStatusFetchingError as e: - # We do not catch the exceptions related to the cluster owner identity - # mismatch, please refer to the comment in - # `backend_utils.check_cluster_available`. - logger.warning( - 'Failed to get the status of the controller. It is not ' - f'fatal, but {controller_name} commands/calls may hang or return ' - 'stale information, when the controller is not up.\n' - f' Details: {common_utils.format_exception(e, use_bracket=True)}') - record = global_user_state.get_cluster_from_name(cluster_name) - controller_status, handle = None, None - if record is not None: - controller_status, handle = record['status'], record['handle'] - - if controller_status is None: - sky_logging.print(non_existent_message) - elif controller_status != status_lib.ClusterStatus.UP: - msg = (f'{controller_name.capitalize()} controller {cluster_name} ' - f'is {controller_status.value}.') - if controller_status == status_lib.ClusterStatus.STOPPED: - msg += f'\n{stopped_message}' - if controller_status == status_lib.ClusterStatus.INIT: - msg += '\nPlease wait for the controller to be ready.' 
- sky_logging.print(msg) - handle = None - return controller_status, handle - - class CloudFilter(enum.Enum): # Filter for all types of clouds. ALL = 'all' @@ -2666,7 +2499,6 @@ class CloudFilter(enum.Enum): def get_clusters( - include_reserved: bool, refresh: bool, cloud_filter: CloudFilter = CloudFilter.CLOUDS_AND_DOCKER, cluster_names: Optional[Union[str, List[str]]] = None, @@ -2679,8 +2511,6 @@ def get_clusters( of the clusters. Args: - include_reserved: Whether to include reserved clusters, e.g. spot - controller. refresh: Whether to refresh the status of the clusters. (Refreshing will set the status to STOPPED if the cluster cannot be pinged.) cloud_filter: Sets which clouds to filer through from the global user @@ -2695,12 +2525,6 @@ def get_clusters( """ records = global_user_state.get_clusters() - if not include_reserved: - records = [ - record for record in records - if Controllers.check_cluster_name(record['name']) is None - ] - yellow = colorama.Fore.YELLOW bright = colorama.Style.BRIGHT reset = colorama.Style.RESET_ALL @@ -2808,51 +2632,6 @@ def _refresh_cluster(cluster_name): return kept_records -# Internal only: -def download_and_stream_latest_job_log( - backend: 'cloud_vm_ray_backend.CloudVmRayBackend', - handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle', local_dir: str, - log_position_hint: str, log_finish_hint: str) -> Optional[str]: - """Downloads and streams the latest job log. - - This function is only used by spot controller and sky serve controller. - """ - os.makedirs(local_dir, exist_ok=True) - log_file = None - try: - log_dirs = backend.sync_down_logs( - handle, - # Download the log of the latest job. - # The job_id for the spot job running on the spot cluster is not - # necessarily 1, as it is possible that the worker node in a - # multi-node cluster is preempted, and we recover the spot job - # on the existing cluster, which leads to a larger job_id. Those - # job_ids all represent the same logical spot job. 
- job_ids=None, - local_dir=local_dir) - except exceptions.CommandError as e: - logger.info(f'Failed to download the logs: ' - f'{common_utils.format_exception(e)}') - else: - if not log_dirs: - logger.error('Failed to find the logs for the user program in ' - f'the {log_position_hint}.') - else: - log_dir = list(log_dirs.values())[0] - log_file = os.path.join(log_dir, 'run.log') - - # Print the logs to the console. - try: - with open(log_file) as f: - print(f.read()) - except FileNotFoundError: - logger.error('Failed to find the logs for the user ' - f'program at {log_file}.') - else: - logger.info(f'\n== End of logs ({log_finish_hint}) ==') - return log_file - - @typing.overload def get_backend_from_handle( handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle' @@ -2932,30 +2711,6 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: return resources_str -def check_cluster_name_not_reserved( - cluster_name: Optional[str], - operation_str: Optional[str] = None) -> None: - """Errors out if the cluster name is reserved. - - Currently, all reserved cluster names are skypilot controller, i.e. - spot controller/sky serve controller. - - Raises: - sky.exceptions.NotSupportedError: if the cluster name is reserved, raise - with an error message explaining 'operation_str' is not allowed. - - Returns: - None, if the cluster name is not reserved. - """ - controller = Controllers.check_cluster_name(cluster_name) - if controller is not None: - msg = controller.value.check_cluster_name_hint - if operation_str is not None: - msg += f' {operation_str} is not allowed.' 
- with ux_utils.print_exception_no_traceback(): - raise exceptions.NotSupportedError(msg) - - # Handle ctrl-c def interrupt_handler(signum, frame): del signum, frame diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 11a7e8b2e0c..61e6b04ac33 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -52,6 +52,7 @@ from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import log_utils from sky.utils import resources_utils from sky.utils import rich_utils @@ -3372,8 +3373,8 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - controller = backend_utils.Controllers.check_cluster_name(name) - if controller == backend_utils.Controllers.SPOT_CONTROLLER: + controller = controller_utils.Controllers.check_cluster_name(name) + if controller == controller_utils.Controllers.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' @@ -3392,7 +3393,8 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif controller == backend_utils.Controllers.SKY_SERVE_CONTROLLER: + elif (controller == + controller_utils.Controllers.SKY_SERVE_CONTROLLER): sn = service_name logger.info( f'{fore.CYAN}Service name: ' @@ -3541,7 +3543,7 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - controller = backend_utils.Controllers.check_cluster_name(name) + controller = controller_utils.Controllers.check_cluster_name(name) if controller is not None or down: return stop_str = ('\nTo stop the cluster:' diff --git a/sky/backends/onprem_utils.py b/sky/backends/onprem_utils.py index b1da3f97f79..a6c84c4df91 100644 --- a/sky/backends/onprem_utils.py +++ 
b/sky/backends/onprem_utils.py @@ -105,9 +105,7 @@ def check_and_get_local_clusters(suppress_error: bool = False) -> List[str]: # Remove clusters that are in global user state but are not in # ~/.sky/local. records = backend_utils.get_clusters( - include_reserved=False, - refresh=False, - cloud_filter=backend_utils.CloudFilter.LOCAL) + refresh=False, cloud_filter=backend_utils.CloudFilter.LOCAL) saved_clusters = [r['name'] for r in records] for cluster_name in saved_clusters: if cluster_name not in local_cluster_names: diff --git a/sky/cli.py b/sky/cli.py index 890c1afb95e..46ad88a8cc2 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -60,12 +60,12 @@ from sky.benchmark import benchmark_state from sky.benchmark import benchmark_utils from sky.clouds import service_catalog -from sky.data import storage_utils from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import env_options from sky.utils import kubernetes_utils @@ -1401,7 +1401,7 @@ def launch( """ # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
env = _merge_env_vars(env_file, env) - backend_utils.check_cluster_name_not_reserved( + controller_utils.check_cluster_name_not_reserved( cluster, operation_str='Launching tasks on it') if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1571,7 +1571,7 @@ def exec( raise ValueError('`ports` is not supported by `sky exec`.') env = _merge_env_vars(env_file, env) - backend_utils.check_cluster_name_not_reserved( + controller_utils.check_cluster_name_not_reserved( cluster, operation_str='Executing task on it') handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: @@ -1890,7 +1890,7 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, reserved_clusters = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = backend_utils.Controllers.check_cluster_name( + controller = controller_utils.Controllers.check_cluster_name( cluster_name) if controller is not None: reserved_clusters.append(cluster_record) @@ -2024,7 +2024,8 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin reserved_clusters = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = backend_utils.Controllers.check_cluster_name(cluster_name) + controller = controller_utils.Controllers.check_cluster_name( + cluster_name) if controller is not None: controller_name = controller.value.name # to display most recent entry for each reserved cluster @@ -2294,7 +2295,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - controller = backend_utils.Controllers.check_cluster_name(cluster) + controller = controller_utils.Controllers.check_cluster_name(cluster) assert controller is not None, cluster click.echo(controller.value.decline_cancel_hint) sys.exit(1) @@ -2588,8 +2589,8 @@ def start( clusters = [ cluster['name'] for 
cluster in global_user_state.get_clusters() - if backend_utils.Controllers.check_cluster_name(cluster['name']) is - None + if controller_utils.Controllers.check_cluster_name(cluster['name']) + is None ] if not clusters: @@ -2657,7 +2658,7 @@ def start( # Checks for reserved clusters (spot controller). reserved, non_reserved = [], [] for name in to_start: - if backend_utils.Controllers.check_cluster_name(name) is not None: + if controller_utils.Controllers.check_cluster_name(name) is not None: reserved.append(name) else: non_reserved.append(name) @@ -2775,7 +2776,8 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return - controller = backend_utils.Controllers.check_cluster_name(controller_name) + controller = controller_utils.Controllers.check_cluster_name( + controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2827,7 +2829,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): click.echo('Sky serve controller has already been torn down.') return - controller = backend_utils.Controllers.check_cluster_name(controller_name) + controller = controller_utils.Controllers.check_cluster_name( + controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2853,9 +2856,9 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): _CONTROLLER_TO_HINT_OR_RAISE = { - backend_utils.Controllers.SPOT_CONTROLLER: + controller_utils.Controllers.SPOT_CONTROLLER: (_hint_or_raise_for_down_spot_controller), - backend_utils.Controllers.SKY_SERVE_CONTROLLER: + controller_utils.Controllers.SKY_SERVE_CONTROLLER: (_hint_or_raise_for_down_sky_serve_controller), } @@ -2903,12 +2906,12 @@ def _down_or_stop_clusters( if len(names) > 0: reserved_clusters = [ 
name for name in names - if backend_utils.Controllers.check_cluster_name(name) is not None + if controller_utils.Controllers.check_cluster_name(name) is not None ] reserved_clusters_str = ', '.join(map(repr, reserved_clusters)) names = [ name for name in _get_glob_clusters(names) - if backend_utils.Controllers.check_cluster_name(name) is None + if controller_utils.Controllers.check_cluster_name(name) is None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2942,7 +2945,7 @@ def _down_or_stop_clusters( f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is currently not supported.') else: - controller = backend_utils.Controllers.check_cluster_name( + controller = controller_utils.Controllers.check_cluster_name( reserved_cluster) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] @@ -2967,8 +2970,10 @@ def _down_or_stop_clusters( # Otherwise, it would be very easy to accidentally delete a reserved # cluster. names = [ - record['name'] for record in all_clusters if - backend_utils.Controllers.check_cluster_name(record['name']) is None + record['name'] + for record in all_clusters + if controller_utils.Controllers.check_cluster_name(record['name']) + is None ] clusters = [] @@ -3580,7 +3585,7 @@ def storage(): def storage_ls(all: bool): """List storage objects managed by SkyPilot.""" storages = sky.storage_ls() - storage_table = storage_utils.format_storage_table(storages, show_all=all) + storage_table = status_utils.format_storage_table(storages, show_all=all) click.echo(storage_table) @@ -4018,8 +4023,8 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): # Cancel managed spot jobs with IDs 1, 2, 3 $ sky spot cancel 1 2 3 """ - _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + _, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, 
stopped_message='All managed spot jobs should have finished.') if handle is None: # Hint messages already printed by the call above. @@ -4103,8 +4108,8 @@ def spot_dashboard(port: Optional[int]): hint = ( 'Dashboard is not available if spot controller is not up. Run a spot ' 'job first.') - _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + _, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=hint, non_existent_message=hint) if handle is None: @@ -4182,7 +4187,7 @@ def serve_up( sky serve up service.yaml """ if service_name is None: - service_name = backend_utils.generate_service_name() + service_name = serve_lib.generate_service_name() is_yaml, _ = _check_yaml(''.join(service_yaml)) if not is_yaml: @@ -4378,8 +4383,8 @@ def serve_down(service_names: List[str], all: bool, yes: bool): 'Can only specify one of SERVICE_NAMES or --all. ' f'Provided {argument_str!r}.') - _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + _, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have been terminated.') if handle is None: # Hint messages already printed by the call above. diff --git a/sky/core.py b/sky/core.py index e148da91725..61239d12d74 100644 --- a/sky/core.py +++ b/sky/core.py @@ -20,6 +20,7 @@ from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib +from sky.utils import controller_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import tpu_utils @@ -105,8 +106,7 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, cluster. If a cluster is found to be terminated or not found, it will be omitted from the returned list. 
""" - return backend_utils.get_clusters(include_reserved=True, - refresh=refresh, + return backend_utils.get_clusters(refresh=refresh, cluster_names=cluster_names) @@ -183,7 +183,8 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: + if controller_utils.Controllers.check_cluster_name( + cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' 'supported for skypilot controllers. Pass ' @@ -195,7 +196,7 @@ def _start( 'fix: omit the `idle_minutes_to_autostop` argument to use the ' f'default autostop settings (got: {idle_minutes_to_autostop}).') idle_minutes_to_autostop = ( - backend_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) + controller_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) # NOTE: if spot_queue() calls _start() and hits here, that entrypoint # would have a cluster name (the controller) filled in. @@ -299,7 +300,8 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. """ - if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: + if controller_utils.Controllers.check_cluster_name( + cluster_name) is not None: raise exceptions.NotSupportedError( f'Stopping sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -422,7 +424,8 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if backend_utils.Controllers.check_cluster_name(cluster_name) is not None: + if controller_utils.Controllers.check_cluster_name( + cluster_name) is not None: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' f'is not supported.') @@ -557,7 +560,7 @@ def cancel( sky.exceptions.CloudUserIdentityError: if we fail to get the current user identity. 
""" - backend_utils.check_cluster_name_not_reserved( + controller_utils.check_cluster_name_not_reserved( cluster_name, operation_str='Cancelling jobs') if all and job_ids: @@ -791,8 +794,8 @@ def spot_queue(refresh: bool, stop_msg = '' if not refresh: stop_msg = 'To view the latest job table: sky spot queue --refresh' - controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=stop_msg) if (refresh and controller_status in [ @@ -864,11 +867,12 @@ def spot_cancel(name: Optional[str] = None, RuntimeError: failed to cancel the job. """ job_ids = [] if job_ids is None else job_ids - cluster_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + cluster_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None or handle.head_ip is None: - # The error message is already printed in backend_utils.is_controller_up + # The error message is already printed in + # controller_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError('', @@ -923,8 +927,8 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int], sky.exceptions.ClusterNotUpError: the spot controller is not up. 
""" # TODO(zhwu): Automatically restart the spot controller - controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=('Please restart the spot controller with ' f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')) if handle is None or handle.head_ip is None: @@ -1055,8 +1059,8 @@ def serve_status( # TODO(tian): This is so slow... It will take ~10s to refresh the status # of controller. Can we optimize this? - controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: @@ -1112,11 +1116,12 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, service_names = [] if isinstance(service_names, str): service_names = [service_names] - cluster_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + cluster_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have terminated.') if handle is None or handle.head_ip is None: - # The error message is already printed in backend_utils.is_controller_up + # The error message is already printed in + # controller_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. 
with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError('', @@ -1215,8 +1220,8 @@ def serve_tail_logs( with ux_utils.print_exception_no_traceback(): raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') - controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: msg = 'No service is found.' diff --git a/sky/data/storage_utils.py b/sky/data/storage_utils.py index 044e00f5aeb..916af13a77b 100644 --- a/sky/data/storage_utils.py +++ b/sky/data/storage_utils.py @@ -1,14 +1,12 @@ """Utility functions for the storage module.""" import os import subprocess -from typing import Any, Dict, List +from typing import List import colorama from sky import exceptions from sky import sky_logging -from sky.utils import log_utils -from sky.utils.cli_utils import status_utils logger = sky_logging.init_logger(__name__) @@ -19,49 +17,6 @@ 'due to the following error: {error_msg!r}') -def format_storage_table(storages: List[Dict[str, Any]], - show_all: bool = False) -> str: - """Format the storage table for display. - - Args: - storage_table (dict): The storage table. - - Returns: - str: The formatted storage table. 
- """ - storage_table = log_utils.create_table([ - 'NAME', - 'UPDATED', - 'STORE', - 'COMMAND', - 'STATUS', - ]) - - for row in storages: - launched_at = row['launched_at'] - if show_all: - command = row['last_use'] - else: - command = status_utils.truncate_long_string( - row['last_use'], status_utils.COMMAND_TRUNC_LENGTH) - storage_table.add_row([ - # NAME - row['name'], - # LAUNCHED - log_utils.readable_time_duration(launched_at), - # CLOUDS - ', '.join([s.value for s in row['store']]), - # COMMAND, - command, - # STATUS - row['status'].value, - ]) - if storages: - return str(storage_table) - else: - return 'No existing storage.' - - def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: """ Lists files and patterns ignored by git in the source directory diff --git a/sky/execution.py b/sky/execution.py index 7438863ac80..997000bc83f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -32,6 +32,7 @@ from sky.skylet import constants from sky.usage import usage_lib from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import env_options from sky.utils import rich_utils @@ -370,7 +371,8 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - controller = backend_utils.Controllers.check_cluster_name(cluster_name) + controller = controller_utils.Controllers.check_cluster_name( + cluster_name) if controller is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # @@ -492,8 +494,8 @@ def launch( Other exceptions may be raised depending on the backend. 
""" entrypoint = task - backend_utils.check_cluster_name_not_reserved(cluster_name, - operation_str='sky.launch') + controller_utils.check_cluster_name_not_reserved(cluster_name, + operation_str='sky.launch') _execute( entrypoint=entrypoint, @@ -577,8 +579,8 @@ def exec( # pylint: disable=redefined-builtin f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is ' 'deprecated. Pass sky.Task instead.' f'{colorama.Style.RESET_ALL}') - backend_utils.check_cluster_name_not_reserved(cluster_name, - operation_str='sky.exec') + controller_utils.check_cluster_name_not_reserved(cluster_name, + operation_str='sky.exec') handle = backend_utils.check_cluster_available( cluster_name, @@ -800,7 +802,7 @@ def spot_launch( stream_logs=stream_logs, cluster_name=controller_name, detach_run=detach_run, - idle_minutes_to_autostop=backend_utils. + idle_minutes_to_autostop=controller_utils. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) @@ -1017,8 +1019,9 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: with rich_utils.safe_status( '[cyan]Registering service on the controller[/]'): with sky_logging.silent(): - status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + status, handle = controller_utils.is_controller_up( + controller_type=controller_utils.Controllers. + SKY_SERVE_CONTROLLER, stopped_message='') if handle is None or handle.head_ip is None: # The sky serve controller is STOPPED, or it is the first time @@ -1099,7 +1102,7 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: stream_logs=False, cluster_name=controller_name, detach_run=True, - idle_minutes_to_autostop=backend_utils. + idle_minutes_to_autostop=controller_utils. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) @@ -1119,7 +1122,7 @@ def serve_up( service_name: Name of the service. 
""" if service_name is None: - service_name = backend_utils.generate_service_name() + service_name = serve.generate_service_name() # The service name will be used as: # 1. controller cluster name: 'sky-serve-controller-' diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index b1705919582..a9ce1e82fbe 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -14,6 +14,7 @@ from sky.serve.serve_utils import generate_remote_controller_log_file_name from sky.serve.serve_utils import generate_remote_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name +from sky.serve.serve_utils import generate_service_name from sky.serve.serve_utils import load_add_service_result from sky.serve.serve_utils import load_serve_status from sky.serve.serve_utils import ServeCodeGen diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 29ac70ad765..576edd212d2 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -19,7 +19,6 @@ from sky import global_user_state from sky import sky_logging from sky import status_lib -from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state from sky.serve import serve_utils @@ -27,6 +26,7 @@ from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import common_utils +from sky.utils import controller_utils if typing.TYPE_CHECKING: from sky.serve import service_spec @@ -497,13 +497,10 @@ def _download_and_stream_logs(info: ReplicaInfo): assert isinstance(handle, backends.CloudVmRayResourceHandle) replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, 'replica_jobs') - log_file = backend_utils.download_and_stream_latest_job_log( - backend, - handle, - replica_job_logs_dir, - log_position_hint='replica cluster', - log_finish_hint=f'Replica: {replica_id}') + log_file = controller_utils.download_and_stream_latest_job_log( + backend, handle, replica_job_logs_dir) if 
log_file is not None: + logger.info(f'\n== End of logs (Replica: {replica_id}) ==') with open(local_log_file_name, 'a') as local_file, open(log_file, 'r') as job_file: local_file.write(job_file.read()) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 42a6a0a6476..088b83b91ef 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -12,6 +12,7 @@ import typing from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, TextIO, Type, TypeVar) +import uuid import colorama import filelock @@ -29,7 +30,7 @@ if typing.TYPE_CHECKING: import fastapi -SKY_SERVE_CONTROLLER_NAME = ( +SKY_SERVE_CONTROLLER_NAME: str = ( f'sky-serve-controller-{common_utils.get_user_hash()}') _SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3) NUM_SERVICE_THRESHOLD = _SYSTEM_MEMORY_GB // constants.SERVICES_MEMORY_USAGE_GB @@ -190,6 +191,10 @@ def run(self, *args, **kwargs): raise +def generate_service_name(): + return f'sky-service-{uuid.uuid4().hex[:4]}' + + def generate_remote_service_dir_name(service_name: str) -> str: service_name = service_name.replace('-', '_') return os.path.join(constants.SKYSERVE_METADATA_DIR, service_name) diff --git a/sky/spot/controller.py b/sky/spot/controller.py index eefb200e09d..e1ad42f7fc1 100644 --- a/sky/spot/controller.py +++ b/sky/spot/controller.py @@ -22,6 +22,7 @@ from sky.spot import spot_utils from sky.usage import usage_lib from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -87,12 +88,9 @@ def _download_log_and_stream( """ spot_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, 'spot_jobs') - backend_utils.download_and_stream_latest_job_log( - self._backend, - handle, - spot_job_logs_dir, - log_position_hint='spot cluster', - log_finish_hint=f'ID: {self._job_id}') + controller_utils.download_and_stream_latest_job_log( + self._backend, handle, 
spot_job_logs_dir) + logger.info(f'\n== End of logs (ID: {self._job_id}) ==') def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: """Busy loop monitoring spot cluster status and handling recovery. diff --git a/sky/spot/spot_utils.py b/sky/spot/spot_utils.py index 3b252374a28..e7e4906a42f 100644 --- a/sky/spot/spot_utils.py +++ b/sky/spot/spot_utils.py @@ -36,7 +36,8 @@ # Add user hash so that two users don't have the same controller VM on # shared-account clouds such as GCP. -SPOT_CONTROLLER_NAME = f'sky-spot-controller-{common_utils.get_user_hash()}' +SPOT_CONTROLLER_NAME: str = ( + f'sky-spot-controller-{common_utils.get_user_hash()}') SIGNAL_FILE_PREFIX = '/tmp/sky_spot_controller_signal_{}' # Controller checks its job's status every this many seconds. JOB_STATUS_CHECK_GAP_SECONDS = 20 diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index ff8ce9ee69c..762ecfa7b77 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -11,6 +11,7 @@ from sky import status_lib from sky.backends import backend_utils from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import log_utils COMMAND_TRUNC_LENGTH = 25 @@ -192,6 +193,49 @@ def format_replica_table(replica_records: List[_ReplicaRecord], return f'{replica_table}{truncate_hint}' +def format_storage_table(storages: List[Dict[str, Any]], + show_all: bool = False) -> str: + """Format the storage table for display. + + Args: + storage_table (dict): The storage table. + + Returns: + str: The formatted storage table. 
+ """ + storage_table = log_utils.create_table([ + 'NAME', + 'UPDATED', + 'STORE', + 'COMMAND', + 'STATUS', + ]) + + for row in storages: + launched_at = row['launched_at'] + if show_all: + command = row['last_use'] + else: + command = truncate_long_string(row['last_use'], + COMMAND_TRUNC_LENGTH) + storage_table.add_row([ + # NAME + row['name'], + # LAUNCHED + log_utils.readable_time_duration(launched_at), + # CLOUDS + ', '.join([s.value for s in row['store']]), + # COMMAND, + command, + # STATUS + row['status'].value, + ]) + if storages: + return str(storage_table) + else: + return 'No existing storage.' + + def get_total_cost_of_displayed_records( cluster_records: List[_ClusterCostReportRecord], display_all: bool): """Compute total cost of records to be displayed in cost report.""" @@ -272,7 +316,8 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], if cluster_records: if reserved_group_name is not None: - autostop_minutes = backend_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP + autostop_minutes = ( + controller_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{reserved_group_name}{colorama.Style.RESET_ALL}' f'{colorama.Style.DIM} (will be autostopped if idle for ' @@ -295,10 +340,8 @@ def show_local_status_table(local_clusters: List[str]): `sky launch`. Sky understands what types of resources are on the nodes and has ran at least one job on the cluster. """ - clusters_status = backend_utils.get_clusters( - include_reserved=False, - refresh=False, - cloud_filter=backend_utils.CloudFilter.LOCAL) + clusters_status = controller_utils.get_non_reserved_clusters( + refresh=False, cloud_filter=backend_utils.CloudFilter.LOCAL) columns = [ 'NAME', 'USER', @@ -419,7 +462,7 @@ def _get_replicas(service_record: _ServiceRecord) -> str: def get_endpoint(service_record: _ServiceRecord) -> str: - # Don't use backend_utils.is_controller_up since it is too slow. 
+ # Don't use controller_utils.is_controller_up since it is too slow. handle = global_user_state.get_handle_from_cluster_name( serve.SKY_SERVE_CONTROLLER_NAME) assert isinstance(handle, backends.CloudVmRayResourceHandle) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py new file mode 100644 index 00000000000..1f902b57172 --- /dev/null +++ b/sky/utils/controller_utils.py @@ -0,0 +1,268 @@ +"""Util constants/functions for SkyPilot Controllers.""" +import dataclasses +import enum +import os +import typing +from typing import Any, Dict, List, Optional, Tuple, Union + +import colorama + +from sky import exceptions +from sky import global_user_state +from sky import serve +from sky import sky_logging +from sky import spot +from sky import status_lib +from sky.backends import backend_utils +from sky.utils import common_utils +from sky.utils import ux_utils + +if typing.TYPE_CHECKING: + from sky import backends + from sky.backends import cloud_vm_ray_backend + +logger = sky_logging.init_logger(__name__) + +# The default idle timeout for skypilot controllers. This include spot +# controller and sky serve controller. 
+CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 + + +@dataclasses.dataclass +class _ControllerSpec: + """Spec for skypilot controllers.""" + name: str + cluster_name: str + sky_status_hint: str + decline_cancel_hint: str + decline_down_in_init_status_hint: str + decline_down_for_dirty_controller_hint: str + check_cluster_name_hint: str + default_hint_if_non_existent: str + + +class Controllers(enum.Enum): + """Skypilot controllers.""" + # NOTE(dev): Keep this align with + # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE + SPOT_CONTROLLER = _ControllerSpec( + name='managed spot controller', + cluster_name=spot.SPOT_CONTROLLER_NAME, + sky_status_hint=( + f'* To see detailed spot job status: {colorama.Style.BRIGHT}' + f'sky spot queue{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' + f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the spot controller while ' + 'it is in INIT state is not supported (this means a spot launch ' + 'is in progress or the previous launch failed), as we cannot ' + 'guarantee that all the spot jobs are finished. Please wait ' + 'until the spot controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{spot.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' + f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' + f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), + check_cluster_name_hint=( + f'Cluster {spot.SPOT_CONTROLLER_NAME} is reserved for ' + 'managed spot controller. 
'), + default_hint_if_non_existent='No managed spot jobs are found.') + SKY_SERVE_CONTROLLER = _ControllerSpec( + name='sky serve controller', + cluster_name=serve.SKY_SERVE_CONTROLLER_NAME, + sky_status_hint=( + f'* To see detailed service status: {colorama.Style.BRIGHT}' + f'sky serve status{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the sky serve controller\'s jobs is not allowed.'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + 'while it is in INIT state is not supported (this means a sky ' + 'serve up is in progress or the previous launch failed), as we ' + 'cannot guarantee that all the services are terminated. Please ' + 'wait until the sky serve controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{serve.SKY_SERVE_CONTROLLER_NAME}' + f'{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' + 'supported, as it is currently serving the following services: ' + '{service_names}. Please terminate the services first with ' + f'{colorama.Style.BRIGHT}sky serve down -a' + f'{colorama.Style.RESET_ALL}.'), + check_cluster_name_hint=( + f'Cluster {serve.SKY_SERVE_CONTROLLER_NAME} is reserved for ' + 'sky serve controller. '), + default_hint_if_non_existent='No service is found.') + + @classmethod + def check_cluster_name(cls, name: Optional[str]) -> Optional['Controllers']: + """Check if the cluster name is a controller name. + + Returns: + The controller if the cluster name is a controller name. + Otherwise, returns None. 
+ """ + for controller in cls: + if controller.value.cluster_name == name: + return controller + return None + + +def is_controller_up( + controller_type: Controllers, + stopped_message: str, + non_existent_message: Optional[str] = None, +) -> Tuple[Optional[status_lib.ClusterStatus], + Optional['backends.CloudVmRayResourceHandle']]: + """Check if the spot/serve controller is up. + + It can be used to check the actual controller status (since the autostop is + set for the controller) before the spot/serve commands interact with the + controller. + + Args: + type: Type of the controller. + stopped_message: Message to print if the controller is STOPPED. + non_existent_message: Message to show if the controller does not exist. + + Returns: + controller_status: The status of the controller. If it fails during + refreshing the status, it will be the cached status. None if the + controller does not exist. + handle: The ResourceHandle of the controller. None if the + controller is not UP or does not exist. + + Raises: + exceptions.ClusterOwnerIdentityMismatchError: if the current user is not + the same as the user who created the cluster. + exceptions.CloudUserIdentityError: if we fail to get the current user + identity. + """ + if non_existent_message is None: + non_existent_message = ( + controller_type.value.default_hint_if_non_existent) + cluster_name = controller_type.value.cluster_name + controller_name = controller_type.value.name.replace(' controller', '') + try: + # Set force_refresh_statuses=None to make sure the refresh only happens + # when the controller is INIT/UP (triggered in these statuses as the + # autostop is always set for the controller). This optimization avoids + # unnecessary costly refresh when the controller is already stopped. + # This optimization is based on the assumption that the user will not + # start the controller manually from the cloud console. 
+ controller_status, handle = backend_utils.refresh_cluster_status_handle( + cluster_name, force_refresh_statuses=None) + except exceptions.ClusterStatusFetchingError as e: + # We do not catch the exceptions related to the cluster owner identity + # mismatch, please refer to the comment in + # `backend_utils.check_cluster_available`. + logger.warning( + 'Failed to get the status of the controller. It is not ' + f'fatal, but {controller_name} commands/calls may hang or return ' + 'stale information, when the controller is not up.\n' + f' Details: {common_utils.format_exception(e, use_bracket=True)}') + record = global_user_state.get_cluster_from_name(cluster_name) + controller_status, handle = None, None + if record is not None: + controller_status, handle = record['status'], record['handle'] + + if controller_status is None: + sky_logging.print(non_existent_message) + elif controller_status != status_lib.ClusterStatus.UP: + msg = (f'{controller_name.capitalize()} controller {cluster_name} ' + f'is {controller_status.value}.') + if controller_status == status_lib.ClusterStatus.STOPPED: + msg += f'\n{stopped_message}' + if controller_status == status_lib.ClusterStatus.INIT: + msg += '\nPlease wait for the controller to be ready.' + sky_logging.print(msg) + handle = None + return controller_status, handle + + +# Internal only: +def download_and_stream_latest_job_log( + backend: 'cloud_vm_ray_backend.CloudVmRayBackend', + handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle', + local_dir: str) -> Optional[str]: + """Downloads and streams the latest job log. + + This function is only used by spot controller and sky serve controller. + """ + os.makedirs(local_dir, exist_ok=True) + log_file = None + try: + log_dirs = backend.sync_down_logs( + handle, + # Download the log of the latest job. 
+ # The job_id for the spot job running on the spot cluster is not + # necessarily 1, as it is possible that the worker node in a + # multi-node cluster is preempted, and we recover the spot job + # on the existing cluster, which leads to a larger job_id. Those + # job_ids all represent the same logical spot job. + job_ids=None, + local_dir=local_dir) + except exceptions.CommandError as e: + logger.info(f'Failed to download the logs: ' + f'{common_utils.format_exception(e)}') + else: + if not log_dirs: + logger.error('Failed to find the logs for the user program.') + else: + log_dir = list(log_dirs.values())[0] + log_file = os.path.join(log_dir, 'run.log') + + # Print the logs to the console. + try: + with open(log_file) as f: + print(f.read()) + except FileNotFoundError: + logger.error('Failed to find the logs for the user ' + f'program at {log_file}.') + return log_file + + +def get_non_reserved_clusters( + refresh: bool, + cloud_filter: backend_utils.CloudFilter = backend_utils.CloudFilter. + CLOUDS_AND_DOCKER, + cluster_names: Optional[Union[str, List[str]]] = None, +) -> List[Dict[str, Any]]: + """Wrapper for the backend_utils.get_clusters without reserved clusters.""" + records = backend_utils.get_clusters(refresh=refresh, + cloud_filter=cloud_filter, + cluster_names=cluster_names) + records = [ + record for record in records + if Controllers.check_cluster_name(record['name']) is None + ] + return records + + +def check_cluster_name_not_reserved( + cluster_name: Optional[str], + operation_str: Optional[str] = None) -> None: + """Errors out if the cluster name is reserved. + + Currently, all reserved cluster names are skypilot controller, i.e. + spot controller/sky serve controller. + + Raises: + sky.exceptions.NotSupportedError: if the cluster name is reserved, raise + with an error message explaining 'operation_str' is not allowed. + + Returns: + None, if the cluster name is not reserved. 
+ """ + controller = Controllers.check_cluster_name(cluster_name) + if controller is not None: + msg = controller.value.check_cluster_name_hint + if operation_str is not None: + msg += f' {operation_str} is not allowed.' + with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError(msg) From a5216152271c0ae8087deaecc2beaabc7337ee2f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 11:51:15 -0800 Subject: [PATCH 177/223] apply suggestion from code review --- sky/cli.py | 57 +++++++++++++++-------------- sky/core.py | 8 ++-- sky/execution.py | 15 ++++---- sky/serve/autoscalers.py | 39 ++++++++++++++------ sky/serve/controller.py | 13 ++++--- sky/serve/replica_managers.py | 5 +-- sky/serve/serve_utils.py | 29 ++++----------- sky/utils/cli_utils/status_utils.py | 6 +-- 8 files changed, 88 insertions(+), 84 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 46ad88a8cc2..1e294eb9ec5 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1907,29 +1907,29 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, def _try_get_future_result(future) -> Tuple[bool, Any]: result = None - success = True + interrupted = False try: result = future.get() except KeyboardInterrupt: pool.terminate() - success = False - return success, result + interrupted = True + return interrupted, result - spot_jobs_success = True + spot_jobs_query_interrupted = False if show_spot_jobs: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Managed spot jobs{colorama.Style.RESET_ALL}') with rich_utils.safe_status('[cyan]Checking spot jobs[/]'): - spot_jobs_success, result = _try_get_future_result( + spot_jobs_query_interrupted, result = _try_get_future_result( spot_jobs_future) - if spot_jobs_success: - num_in_progress_jobs, msg = result - else: + if spot_jobs_query_interrupted: # Set to -1, so that the controller is not considered # down, and the hint for showing sky spot queue # will still be shown. 
num_in_progress_jobs = -1 msg = 'KeyboardInterrupt' + else: + num_in_progress_jobs, msg = result click.echo(msg) if num_in_progress_jobs is not None: @@ -1954,16 +1954,15 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: if show_services: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Services{colorama.Style.RESET_ALL}') - if not spot_jobs_success: + if spot_jobs_query_interrupted: # The pool is terminated, so we cannot run the service query. - click.secho('Failed to query services. Please try again later.', - fg='yellow') + msg = 'KeyboardInterrupt' else: with rich_utils.safe_status('[cyan]Checking services[/]'): - success, msg = _try_get_future_result(services_future) - if not success: + interrupted, msg = _try_get_future_result(services_future) + if interrupted: msg = 'KeyboardInterrupt' - click.echo(msg) + click.echo(msg) if show_spot_jobs or show_services: try: @@ -2021,7 +2020,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin cluster_records = core.cost_report() nonreserved_cluster_records = [] - reserved_clusters = dict() + controllers = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] controller = controller_utils.Controllers.check_cluster_name( @@ -2030,8 +2029,8 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin controller_name = controller.value.name # to display most recent entry for each reserved cluster # TODO(sgurram): fix assumption of sorted order of clusters - if controller_name not in reserved_clusters: - reserved_clusters[controller_name] = cluster_record + if controller_name not in controllers: + controllers[controller_name] = cluster_record else: nonreserved_cluster_records.append(cluster_record) @@ -2039,11 +2038,9 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin nonreserved_cluster_records, all) status_utils.show_cost_report_table(nonreserved_cluster_records, all) - for controller_name, cluster_record in reserved_clusters.items(): 
+ for controller_name, cluster_record in controllers.items(): status_utils.show_cost_report_table( - [cluster_record], - all, - reserved_group_name=controller_name.capitalize()) + [cluster_record], all, controller_name=controller_name.capitalize()) total_cost += cluster_record['total_cost'] click.echo(f'\n{colorama.Style.BRIGHT}' @@ -2655,26 +2652,30 @@ def start( if not to_start: return - # Checks for reserved clusters (spot controller). - reserved, non_reserved = [], [] + # Checks for controller clusters (spot controller / sky serve controller). + controllers, normal_clusters = [], [] for name in to_start: if controller_utils.Controllers.check_cluster_name(name) is not None: - reserved.append(name) + controllers.append(name) else: - non_reserved.append(name) - if reserved and non_reserved: + normal_clusters.append(name) + if controllers and normal_clusters: # Keep this behavior the same as _down_or_stop_clusters(). raise click.UsageError('Starting controllers with other cluster(s) ' 'is currently not supported.\n' 'Please start the former independently.') - if reserved: + if controllers: bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD + if len(controllers) != 1: + raise click.UsageError( + 'Starting multiple controllers is currently not supported.\n' + 'Please start them independently.') if idle_minutes_to_autostop is not None: raise click.UsageError( 'Autostop options are currently not allowed when starting the ' 'controllers. 
Use the default autostop settings by directly ' - f'calling: {bold}sky start {" ".join(reserved)}{reset_bold}') + f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}') if not yes: cluster_str = 'clusters' if len(to_start) > 1 else 'cluster' diff --git a/sky/core.py b/sky/core.py index 61239d12d74..e94b26614ed 100644 --- a/sky/core.py +++ b/sky/core.py @@ -187,12 +187,12 @@ def _start( cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' - 'supported for skypilot controllers. Pass ' + 'supported for SkyPilot controllers. Pass ' '`down=False` or omit it instead.') if idle_minutes_to_autostop is not None: raise ValueError( 'Passing a custom autostop setting is currently not ' - 'supported when starting skypilot controllers. To ' + 'supported when starting SkyPilot controllers. To ' 'fix: omit the `idle_minutes_to_autostop` argument to use the ' f'default autostop settings (got: {idle_minutes_to_autostop}).') idle_minutes_to_autostop = ( @@ -875,7 +875,7 @@ def spot_cancel(name: Optional[str] = None, # controller_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError('', + raise exceptions.ClusterNotUpError(message='', cluster_status=cluster_status) job_id_str = ','.join(map(str, job_ids)) @@ -1124,7 +1124,7 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, # controller_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. 
with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError('', + raise exceptions.ClusterNotUpError(message='', cluster_status=cluster_status) service_names_str = ','.join(service_names) diff --git a/sky/execution.py b/sky/execution.py index 997000bc83f..8cfa417f6e7 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -601,9 +601,9 @@ def exec( # pylint: disable=redefined-builtin detach_run=detach_run) -def _shared_controller_env_vars() -> Dict[str, Any]: - env_vars: Dict[str, Any] = { - env.value: 1 for env in env_options.Options if env.get() +def _shared_controller_env_vars() -> Dict[str, str]: + env_vars: Dict[str, str] = { + env.value: '1' for env in env_options.Options if env.get() } env_vars.update({ # Should not use $USER here, as that env var can be empty when @@ -611,7 +611,7 @@ def _shared_controller_env_vars() -> Dict[str, Any]: constants.USER_ENV_VAR: getpass.getuser(), constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), # Skip cloud identity check to avoid the overhead. - env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: 1, + env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1', }) return env_vars @@ -1038,9 +1038,10 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: with ux_utils.print_exception_no_traceback(): raise RuntimeError( f'The service {service_name!r} is already running. ' - 'Updating a service will be supported in the future. ' - 'For now, `sky serve down` and then `sky serve up` ' - 'again.') + 'Please specify a different name for your service. 
' + 'To update an existing service, run: `sky serve down` ' + 'and then `sky serve up` again (in-place update will ' + 'be supported in the future).') _maybe_translate_local_file_mounts_and_sync_up(task, path='serve') diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 1ea58583841..36ceb6223f6 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -4,7 +4,7 @@ import enum import time import typing -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union from sky import sky_logging from sky.serve import constants @@ -12,6 +12,7 @@ from sky.serve import serve_utils if typing.TYPE_CHECKING: + from sky.serve import replica_managers from sky.serve import service_spec logger = sky_logging.init_logger(__name__) @@ -19,7 +20,7 @@ # Since sky.launch is very resource demanding, we limit the number of # concurrent sky.launch process to avoid overloading the machine. # TODO(tian): determine this value based on controller resources. -_MAX_BOOTSTRAPPING_NUM = 5 +_MAX_NUM_LAUNCH = 5 class AutoscalerDecisionOperator(enum.Enum): @@ -30,6 +31,18 @@ class AutoscalerDecisionOperator(enum.Enum): @dataclasses.dataclass class AutoscalerDecision: + """Autoscaling decisions. 
+ + |---------------------------------------------------------| + | Operator | TargetType | Meaning | + |------------|------------|-------------------------------| + | SCALE_UP | int | Number of replicas to add | + |------------|------------|-------------------------------| + | SCALE_DOWN | List[int] | List of replica ids to remove | + |------------|------------|-------------------------------| + | NO_OP | None | No scaling needed | + |---------------------------------------------------------| + """ operator: AutoscalerDecisionOperator target: Optional[Union[int, List[int]]] @@ -63,8 +76,10 @@ def collect_request_information( """Collect request information from aggregator for autoscaling.""" raise NotImplementedError - def evaluate_scaling(self, infos: List[Dict[str, - Any]]) -> AutoscalerDecision: + def evaluate_scaling( + self, + replica_infos: List['replica_managers.ReplicaInfo'], + ) -> AutoscalerDecision: """Evaluate autoscale options based on replica information.""" raise NotImplementedError @@ -111,10 +126,10 @@ def collect_request_information( def evaluate_scaling( self, - infos: List[Dict[str, Any]], + replica_infos: List['replica_managers.ReplicaInfo'], ) -> AutoscalerDecision: current_time = time.time() - num_replicas = len(infos) + num_replicas = len(replica_infos) # Check if cooldown period has passed since the last scaling operation. # Only cooldown if bootstrapping is done. @@ -146,7 +161,7 @@ def evaluate_scaling( return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, target=min( self.min_replicas - num_replicas, - _MAX_BOOTSTRAPPING_NUM)) + _MAX_NUM_LAUNCH)) if (self.upper_threshold is not None and requests_per_replica > self.upper_threshold): if num_replicas < self.max_replicas: @@ -178,16 +193,16 @@ def evaluate_scaling( self.last_scale_operation = current_time # Remove FAILED replicas first. 
replica_ids_to_remove: List[int] = [] - for i in infos: + for info in replica_infos: if len(replica_ids_to_remove) >= num_replicas_to_remove: break - if i['status'] == serve_state.ReplicaStatus.FAILED: - replica_ids_to_remove.append(i['replica_id']) + if info.status == serve_state.ReplicaStatus.FAILED: + replica_ids_to_remove.append(info.replica_id) # Then rest of them. - for i in infos: + for info in replica_infos: if len(replica_ids_to_remove) >= num_replicas_to_remove: break - replica_ids_to_remove.append(i['replica_id']) + replica_ids_to_remove.append(info.replica_id) return AutoscalerDecision( AutoscalerDecisionOperator.SCALE_DOWN, target=replica_ids_to_remove) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index d039524f162..0e963a396f8 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -16,7 +16,7 @@ from sky.serve import autoscalers from sky.serve import constants from sky.serve import replica_managers -from sky.serve import serve_utils +from sky.serve import serve_state from sky.utils import env_options logger = sky_logging.init_logger(__name__) @@ -57,10 +57,13 @@ def _run_autoscaler(self): logger.info('Starting autoscaler.') while True: try: - replica_info = serve_utils.get_replica_info( - self._service_name, - with_handle=env_options.Options.SHOW_DEBUG_INFO.get()) - logger.info(f'All replica info: {replica_info}') + replica_info = serve_state.get_replica_infos(self._service_name) + replica_info_dicts = [ + info.to_info_dict( + with_handle=env_options.Options.SHOW_DEBUG_INFO.get()) + for info in replica_info + ] + logger.info(f'All replica info: {replica_info_dicts}') scaling_option = self._autoscaler.evaluate_scaling(replica_info) if (scaling_option.operator == autoscalers.AutoscalerDecisionOperator.SCALE_UP): diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 576edd212d2..3e7996a0e55 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -809,9 +809,8 @@ def 
_replica_prober(self) -> None: try: self._probe_all_replicas() replica_statuses = [ - info['status'] - for info in serve_utils.get_replica_info(self._service_name, - with_handle=False) + info.status for info in serve_state.get_replica_infos( + self._service_name) ] serve_utils.set_service_status_from_replica_statuses( self._service_name, replica_statuses) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 088b83b91ef..3e4202e161d 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -294,23 +294,6 @@ def load_add_service_result(payload: str) -> bool: return common_utils.decode_payload(payload) -def get_replica_info(service_name: str, - with_handle: bool) -> List[Dict[str, Any]]: - """Get the information of all replicas of the service. - - Args: - service_name: The name of the service. - with_handle: Whether to include the handle of the replica. - - Returns: - A list of dictionaries of replica information. - """ - return [ - info.to_info_dict(with_handle=with_handle) - for info in serve_state.get_replica_infos(service_name) - ] - - def get_serve_status(service_name: str, with_replica_info: bool = True) -> Dict[str, Any]: """Get the status dict of the service. 
@@ -326,8 +309,10 @@ def get_serve_status(service_name: str, if record is None: raise ValueError(f'Service {service_name!r} does not exist.') if with_replica_info: - record['replica_info'] = get_replica_info(service_name, - with_handle=True) + record['replica_info'] = [ + info.to_info_dict(with_handle=True) + for info in serve_state.get_replica_infos(service_name) + ] return record @@ -496,10 +481,10 @@ def stream_replica_logs(service_name: str, f'{colorama.Style.RESET_ALL}') def _get_replica_status() -> serve_state.ReplicaStatus: - replica_info = get_replica_info(service_name, with_handle=False) + replica_info = serve_state.get_replica_infos(service_name) for info in replica_info: - if info['replica_id'] == replica_id: - return info['status'] + if info.replica_id == replica_id: + return info.status raise ValueError( _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 762ecfa7b77..8ddfa875e9c 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -252,7 +252,7 @@ def get_total_cost_of_displayed_records( def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], show_all: bool, - reserved_group_name: Optional[str] = None): + controller_name: Optional[str] = None): """Compute cluster table values and display for cost report. 
For each cluster, this shows: cluster name, resources, launched time, @@ -315,11 +315,11 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], cluster_table.add_row(row) if cluster_records: - if reserved_group_name is not None: + if controller_name is not None: autostop_minutes = ( controller_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'{reserved_group_name}{colorama.Style.RESET_ALL}' + f'{controller_name}{colorama.Style.RESET_ALL}' f'{colorama.Style.DIM} (will be autostopped if idle for ' f'{autostop_minutes}min)' f'{colorama.Style.RESET_ALL}') From 191d2ff1dc24a8c4b998a760474d3e630089cdab Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Thu, 9 Nov 2023 11:51:48 -0800 Subject: [PATCH 178/223] Update sky/exceptions.py Co-authored-by: Zhanghao Wu --- sky/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/exceptions.py b/sky/exceptions.py index 2eda6144c7c..1db0b583a52 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -253,5 +253,5 @@ def __init__(self, region: str, class ServeUserTerminatedError(Exception): - """Raised when a user tear down the service.""" + """Raised by serve controller when a user tear down the service.""" pass From 61774bcd79332fe173d11e73043e9d70230aca5f Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Thu, 9 Nov 2023 11:52:39 -0800 Subject: [PATCH 179/223] Update sky/serve/replica_managers.py Co-authored-by: Zhanghao Wu --- sky/serve/replica_managers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 3e7996a0e55..1b4080e052c 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -92,7 +92,8 @@ def launch_cluster(task_yaml_path: str, except Exception as e: # pylint: disable=broad-except logger.info('Failed to launch the sky serve replica cluster with ' f'error: {common_utils.format_exception(e)})') - 
logger.info(f' Traceback: {traceback.format_exc()}') + with ux_utils.enable_traceback(): + logger.info(f' Traceback: {traceback.format_exc()}') else: # No exception, the launch succeeds. return From e38fc1729ef540761fa3d2af31545a21c75368bd Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 13:53:16 -0800 Subject: [PATCH 180/223] import --- sky/serve/replica_managers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 1b4080e052c..0cf63898a10 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -27,6 +27,7 @@ from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils +from sky.utils import ux_utils if typing.TYPE_CHECKING: from sky.serve import service_spec From 01bbac74b24f24a11330adbb6e3cfa9c911e858f Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Thu, 9 Nov 2023 13:53:43 -0800 Subject: [PATCH 181/223] Update sky/serve/autoscalers.py Co-authored-by: Zhanghao Wu --- sky/serve/autoscalers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 36ceb6223f6..71cff94ff2d 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -166,8 +166,8 @@ def evaluate_scaling( requests_per_replica > self.upper_threshold): if num_replicas < self.max_replicas: scale_target = requests_per_replica / self.upper_threshold - num_replicas_to_add = max(int(scale_target * num_replicas), - self.max_replicas) - num_replicas + num_replicas_to_add = min(max(int(scale_target * num_replicas), + self.min_replicas), self.max_replicas) - num_replicas if num_replicas_to_add > 0: plural = 's' if num_replicas_to_add > 1 else '' logger.info('Requests per replica is above upper threshold ' From 9a198b07c4d9cbdc5e7f7048f77d808b6dda676a Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 14:29:19 -0800 Subject: [PATCH 182/223] move max #sky.launch to replica 
manager and limit total # across services --- sky/cli.py | 3 ++ sky/serve/autoscalers.py | 14 +++----- sky/serve/replica_managers.py | 65 ++++++++++++++++++++++++----------- sky/serve/serve_state.py | 16 +++++++++ 4 files changed, 67 insertions(+), 31 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 1e294eb9ec5..b7d4b8b8ebf 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4281,6 +4281,9 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): Each replica can have one of the following statuses: + - ``PENDING``: The maximum number of simultaneous launches has been reached + and the replica launch process is pending. + - ``PROVISIONING``: The replica is being provisioned. - ``STARTING``: Replica provisioning has succeeded and the replica is diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 71cff94ff2d..bd711c59c76 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -17,11 +17,6 @@ logger = sky_logging.init_logger(__name__) -# Since sky.launch is very resource demanding, we limit the number of -# concurrent sky.launch process to avoid overloading the machine. -# TODO(tian): determine this value based on controller resources. 
-_MAX_NUM_LAUNCH = 5 - class AutoscalerDecisionOperator(enum.Enum): SCALE_UP = 'scale_up' @@ -159,15 +154,14 @@ def evaluate_scaling( logger.info('Bootstrapping service.') self.last_scale_operation = current_time return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, - target=min( - self.min_replicas - num_replicas, - _MAX_NUM_LAUNCH)) + target=self.min_replicas - num_replicas) if (self.upper_threshold is not None and requests_per_replica > self.upper_threshold): if num_replicas < self.max_replicas: scale_target = requests_per_replica / self.upper_threshold - num_replicas_to_add = min(max(int(scale_target * num_replicas), - self.min_replicas), self.max_replicas) - num_replicas + num_replicas_to_add = min( + max(int(scale_target * num_replicas), self.min_replicas), + self.max_replicas) - num_replicas if num_replicas_to_add > 0: plural = 's' if num_replicas_to_add > 1 else '' logger.info('Requests per replica is above upper threshold ' diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 0cf63898a10..9a2610628f6 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -11,6 +11,7 @@ import typing from typing import Any, Dict, List, Optional, Tuple +import psutil import requests import sky @@ -40,6 +41,10 @@ _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180 _RETRY_INIT_GAP_SECONDS = 60 +# Since sky.launch is very resource demanding, we limit the number of +# concurrent sky.launch process to avoid overloading the machine. +_MAX_NUM_LAUNCH = psutil.cpu_count() + def launch_cluster(task_yaml_path: str, cluster_name: str, @@ -170,9 +175,8 @@ class ReplicaStatusProperty: first_ready_time: The first time the service is ready. sky_down_status: Process status of sky.down. """ - # Initial value is RUNNING since each `ReplicaInfo` is created - # when `sky.launch` is called. - sky_launch_status: ProcessStatus = ProcessStatus.RUNNING + # None means sky.launch is not called yet. 
+ sky_launch_status: Optional[ProcessStatus] = None user_app_failed: bool = False service_ready_now: bool = False # None means readiness probe is not passed yet. @@ -235,6 +239,9 @@ def should_track_status(self) -> bool: def to_replica_status(self) -> serve_state.ReplicaStatus: """Convert status property to human-readable replica status.""" + if self.sky_launch_status is None: + # Pending to launch + return serve_state.ReplicaStatus.PENDING if self.sky_launch_status == ProcessStatus.RUNNING: # Still launching return serve_state.ReplicaStatus.PROVISIONING @@ -459,7 +466,8 @@ def _launch_replica(self, replica_id: int) -> None: ).run, args=(self._task_yaml_path, cluster_name), ) - p.start() + # Don't start right now; we will start it later in _refresh_process_pool + # to avoid too many sky.launch running at the same time. self._launch_process_pool[replica_id] = p info = ReplicaInfo(replica_id, cluster_name) serve_state.add_or_update_replica(self._service_name, replica_id, info) @@ -559,29 +567,44 @@ def _refresh_process_pool(self) -> None: """ for replica_id, p in list(self._launch_process_pool.items()): if not p.is_alive(): - # TODO(tian): Try-catch in process, and have an enum return - # value to indicate which type of failure happened. - # Currently we only have user code failure since the - # retry_until_up flag is set to True, but it will be helpful - # when we enable user choose whether to retry or not. - logger.info( - f'Launch process for replica {replica_id} finished.') - del self._launch_process_pool[replica_id] info = serve_state.get_replica_info_from_id( self._service_name, replica_id) assert info is not None - if p.exitcode != 0: - logger.warning( - f'Launch process for replica {replica_id} exited ' - f'abnormally with code {p.exitcode}. 
Terminating...') - info.status_property.sky_launch_status = ( - ProcessStatus.FAILED) - self._terminate_replica(replica_id, sync_down_logs=True) + error_in_sky_launch = False + if info.status == serve_state.ReplicaStatus.PENDING: + # sky.launch not started yet + if (serve_state.total_number_provisioning_replicas() < + _MAX_NUM_LAUNCH): + p.start() + info.status_property.sky_launch_status = ( + ProcessStatus.RUNNING) else: - info.status_property.sky_launch_status = ( - ProcessStatus.SUCCEEDED) + # sky.launch finished + # TODO(tian): Try-catch in process, and have an enum return + # value to indicate which type of failure happened. + # Currently we only have user code failure since the + # retry_until_up flag is set to True, but it will be helpful + # when we enable user choose whether to retry or not. + logger.info( + f'Launch process for replica {replica_id} finished.') + del self._launch_process_pool[replica_id] + if p.exitcode != 0: + logger.warning( + f'Launch process for replica {replica_id} ' + f'exited abnormally with code {p.exitcode}. ' + 'Terminating...') + info.status_property.sky_launch_status = ( + ProcessStatus.FAILED) + error_in_sky_launch = True + else: + info.status_property.sky_launch_status = ( + ProcessStatus.SUCCEEDED) serve_state.add_or_update_replica(self._service_name, replica_id, info) + if error_in_sky_launch: + # Teardown after update replica info since + # _terminate_replica will update the replica info too. + self._terminate_replica(replica_id, sync_down_logs=True) for replica_id, p in list(self._down_process_pool.items()): if not p.is_alive(): logger.info( diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index ab4e772f7e4..00bd511a060 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -52,6 +52,9 @@ class ReplicaStatus(enum.Enum): """Replica status.""" + # The `sky.launch` is pending due to max number of simultaneous launches. + PENDING = 'PENDING' + # The replica VM is being provisioned. 
i.e., the `sky.launch` is still # running. PROVISIONING = 'PROVISIONING' @@ -97,6 +100,7 @@ def colored_str(self) -> str: _REPLICA_STATUS_TO_COLOR = { + ReplicaStatus.PENDING: colorama.Fore.YELLOW, ReplicaStatus.PROVISIONING: colorama.Fore.BLUE, ReplicaStatus.STARTING: colorama.Fore.CYAN, ReplicaStatus.READY: colorama.Fore.GREEN, @@ -361,3 +365,15 @@ def get_replica_infos( SELECT replica_info FROM replicas WHERE service_name=(?)""", (service_name,)).fetchall() return [pickle.loads(row[0]) for row in rows] + + +def total_number_provisioning_replicas() -> int: + """Returns the total number of provisioning replicas.""" + with db_utils.safe_cursor(_DB_PATH) as cursor: + rows = cursor.execute('SELECT replica_info FROM replicas').fetchall() + provisioning_count = 0 + for row in rows: + replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0]) + if replica_info.status == ReplicaStatus.PROVISIONING: + provisioning_count += 1 + return provisioning_count From 4ae8996946a6dd32217eb907a266473530a77043 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 14:53:29 -0800 Subject: [PATCH 183/223] refactor autoscaler --- sky/serve/autoscalers.py | 90 +++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 51 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index bd711c59c76..761c41053f8 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -148,57 +148,45 @@ def evaluate_scaling( logger.info(f'Requests per replica: {requests_per_replica}') - # Bootstrap case logger.info(f'Number of replicas: {num_replicas}') + target_num_replicas = num_replicas if num_replicas < self.min_replicas: - logger.info('Bootstrapping service.') - self.last_scale_operation = current_time + target_num_replicas = self.min_replicas + elif (self.upper_threshold is not None and + requests_per_replica > self.upper_threshold): + scale_target = requests_per_replica / self.upper_threshold + target_num_replicas = int(scale_target * 
num_replicas) + elif (self.lower_threshold is not None and + requests_per_replica < self.lower_threshold): + scale_target = requests_per_replica / self.lower_threshold + target_num_replicas = int(scale_target * num_replicas) + + target_num_replicas = max(self.min_replicas, + min(self.max_replicas, target_num_replicas)) + num_replicas_delta = target_num_replicas - num_replicas + if num_replicas_delta == 0: + logger.info('No scaling needed.') + return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, + target=None) + elif num_replicas_delta > 0: + logger.info(f'Scaling up by {num_replicas_delta} replicas.') return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, - target=self.min_replicas - num_replicas) - if (self.upper_threshold is not None and - requests_per_replica > self.upper_threshold): - if num_replicas < self.max_replicas: - scale_target = requests_per_replica / self.upper_threshold - num_replicas_to_add = min( - max(int(scale_target * num_replicas), self.min_replicas), - self.max_replicas) - num_replicas - if num_replicas_to_add > 0: - plural = 's' if num_replicas_to_add > 1 else '' - logger.info('Requests per replica is above upper threshold ' - f'{self.upper_threshold}qps / replica. ' - f'Scaling up by {num_replicas_to_add} ' - f'replica{plural}.') - self.last_scale_operation = current_time - return AutoscalerDecision( - AutoscalerDecisionOperator.SCALE_UP, - target=num_replicas_to_add) - if (self.lower_threshold is not None and - requests_per_replica < self.lower_threshold): - if num_replicas > self.min_replicas: - scale_target = requests_per_replica / self.lower_threshold - num_replicas_to_remove = num_replicas - min( - int(scale_target * num_replicas), self.min_replicas) - if num_replicas_to_remove > 0: - plural = 's' if num_replicas_to_remove > 1 else '' - logger.info('Requests per replica is below lower threshold ' - f'{self.lower_threshold}qps / replica. 
' - f'Scaling down by {num_replicas_to_remove} ' - f'replica{plural}.') - self.last_scale_operation = current_time - # Remove FAILED replicas first. - replica_ids_to_remove: List[int] = [] - for info in replica_infos: - if len(replica_ids_to_remove) >= num_replicas_to_remove: - break - if info.status == serve_state.ReplicaStatus.FAILED: - replica_ids_to_remove.append(info.replica_id) - # Then rest of them. - for info in replica_infos: - if len(replica_ids_to_remove) >= num_replicas_to_remove: - break - replica_ids_to_remove.append(info.replica_id) - return AutoscalerDecision( - AutoscalerDecisionOperator.SCALE_DOWN, - target=replica_ids_to_remove) - logger.info('No scaling needed.') - return AutoscalerDecision(AutoscalerDecisionOperator.NO_OP, target=None) + target=num_replicas_delta) + else: + num_replicas_to_remove = -num_replicas_delta + # Remove FAILED replicas first. + replica_ids_to_remove: List[int] = [] + for info in replica_infos: + if len(replica_ids_to_remove) >= num_replicas_to_remove: + break + if info.status == serve_state.ReplicaStatus.FAILED: + replica_ids_to_remove.append(info.replica_id) + # Then rest of them. 
+ for info in replica_infos: + if len(replica_ids_to_remove) >= num_replicas_to_remove: + break + replica_ids_to_remove.append(info.replica_id) + logger.info(f'Scaling down by {num_replicas_to_remove} replicas ' + f'(id: {replica_ids_to_remove}).') + return AutoscalerDecision(AutoscalerDecisionOperator.SCALE_DOWN, + target=replica_ids_to_remove) From 84a41209521b6243830b2f78043dde83c93801d4 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 15:18:59 -0800 Subject: [PATCH 184/223] pass json dict rather than pickle --- sky/serve/autoscalers.py | 22 +++++++++++++--------- sky/serve/controller.py | 6 +----- sky/serve/load_balancer.py | 7 ++----- sky/serve/load_balancing_policies.py | 5 +++++ sky/serve/replica_managers.py | 4 ++++ sky/serve/serve_utils.py | 16 ++++++++-------- 6 files changed, 33 insertions(+), 27 deletions(-) diff --git a/sky/serve/autoscalers.py b/sky/serve/autoscalers.py index 761c41053f8..8c0f80d681a 100644 --- a/sky/serve/autoscalers.py +++ b/sky/serve/autoscalers.py @@ -4,12 +4,11 @@ import enum import time import typing -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from sky import sky_logging from sky.serve import constants from sky.serve import serve_state -from sky.serve import serve_utils if typing.TYPE_CHECKING: from sky.serve import replica_managers @@ -67,7 +66,7 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', 'not always got the latest information.') def collect_request_information( - self, request_aggregator: serve_utils.RequestsAggregator) -> None: + self, request_aggregator_info: Dict[str, Any]) -> None: """Collect request information from aggregator for autoscaling.""" raise NotImplementedError @@ -108,12 +107,17 @@ def __init__(self, spec: 'service_spec.SkyServiceSpec', frequency: int, self.request_timestamps: List[float] = [] def collect_request_information( - self, request_aggregator: serve_utils.RequestsAggregator) -> None: - if not isinstance(request_aggregator, 
serve_utils.RequestTimestamp): - raise ValueError('Request aggregator must be of type ' - 'serve_utils.RequestTimestamp for ' - 'RequestRateAutoscaler.') - self.request_timestamps.extend(request_aggregator.get()) + self, request_aggregator_info: Dict[str, Any]) -> None: + """Collect request information from aggregator for autoscaling. + + request_aggregator_info should be a dict with the following format: + + { + 'timestamps': [timestamp1 (float), timestamp2 (float), ...] + } + """ + self.request_timestamps.extend( + request_aggregator_info.get('timestamps', [])) current_time = time.time() index = bisect.bisect_left(self.request_timestamps, current_time - self.rps_window_size) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 0e963a396f8..309a2acfb97 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -2,9 +2,7 @@ Responsible for autoscaling and replica management. """ -import base64 import logging -import pickle import threading import time @@ -86,9 +84,7 @@ def run(self) -> None: @self._app.post('/controller/load_balancer_sync') async def load_balancer_sync(request: fastapi.Request): request_data = await request.json() - request_aggregator_payload = request_data.get('request_aggregator') - request_aggregator = pickle.loads( - base64.b64decode(request_aggregator_payload)) + request_aggregator = request_data.get('request_aggregator') logger.info( f'Received inflight request information: {request_aggregator}') self._autoscaler.collect_request_information(request_aggregator) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index a57276ba322..b39bcf5ac2d 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -1,7 +1,5 @@ """LoadBalancer: redirect any incoming request to an endpoint replica.""" -import base64 import logging -import pickle import threading import time @@ -65,9 +63,8 @@ def _sync_with_controller(self): response = session.post( self._controller_url + 
'/controller/load_balancer_sync', json={ - 'request_aggregator': base64.b64encode( - pickle.dumps(self._request_aggregator) - ).decode('utf-8') + 'request_aggregator': + self._request_aggregator.to_dict() }, timeout=5) # Clean up after reporting request information to avoid OOM. diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index 038d33759c2..27e640e1dca 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,4 +1,5 @@ """LoadBalancingPolicy: Policy to select endpoint.""" +import random from typing import List, Optional import fastapi @@ -30,6 +31,10 @@ def __init__(self, *args, **kwargs) -> None: def set_ready_replicas(self, ready_replicas: List[str]) -> None: if set(ready_replicas) != set(self.ready_replicas): + # If the autoscaler keeps scaling up and down the replicas, + # we need this shuffle to not let the first replica have the + # most of the load. + random.shuffle(ready_replicas) self.ready_replicas = ready_replicas self.index = 0 diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 9a2610628f6..9fce0ca7af5 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -46,6 +46,8 @@ _MAX_NUM_LAUNCH = psutil.cpu_count() +# TODO(tian): Combine this with +# sky/spot/recovery_strategy.py::StrategyExecutor::launch def launch_cluster(task_yaml_path: str, cluster_name: str, max_retry: int = 3) -> None: @@ -113,6 +115,8 @@ def launch_cluster(task_yaml_path: str, time.sleep(gap_seconds) +# TODO(tian): Combine this with +# sky/spot/recovery_strategy.py::terminate_cluster def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: """Terminate the sky serve replica cluster.""" retry_cnt = 0 diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 3e4202e161d..3c6a4174d71 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -120,14 +120,14 @@ def add(self, request: 'fastapi.Request') -> None: 
"""Add a request to the request aggregator.""" raise NotImplementedError - def get(self) -> List[Any]: - """Get all current request aggregator.""" - raise NotImplementedError - def clear(self) -> None: """Clear all current request aggregator.""" raise NotImplementedError + def to_dict(self) -> Dict[str, Any]: + """Convert the aggregator to a dict.""" + raise NotImplementedError + def __repr__(self) -> str: raise NotImplementedError @@ -146,14 +146,14 @@ def add(self, request: 'fastapi.Request') -> None: del request # unused self.timestamps.append(time.time()) - def get(self) -> List[float]: - """Get all current request aggregator.""" - return self.timestamps - def clear(self) -> None: """Clear all current request aggregator.""" self.timestamps = [] + def to_dict(self) -> Dict[str, Any]: + """Convert the aggregator to a dict.""" + return {'timestamps': self.timestamps} + def __repr__(self) -> str: return f'RequestTimestamp(timestamps={self.timestamps})' From b6ef4480adffe904d97cf0fa0229b9fd4044d1d3 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 15:45:56 -0800 Subject: [PATCH 185/223] apply suggestion from code review --- sky/backends/cloud_vm_ray_backend.py | 5 ++- sky/serve/replica_managers.py | 11 +++---- sky/serve/serve_utils.py | 34 -------------------- sky/serve/service.py | 3 +- sky/utils/ux_utils.py | 46 ++++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 44 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 61e6b04ac33..44a0dd094ef 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -312,11 +312,11 @@ def add_gang_scheduling_placement_group_and_setup( self._has_gang_scheduling = True self._num_nodes = num_nodes - task_cpu_demand = resources_dict.pop('CPU') + bundles = [copy.copy(resources_dict) for _ in range(num_nodes)] # Set CPU to avoid ray hanging the resources allocation # for remote functions, since the task will request 1 CPU 
# by default. - bundles = [{'CPU': task_cpu_demand} for _ in range(num_nodes)] + task_cpu_demand = resources_dict.pop('CPU') if resources_dict: assert len(resources_dict) == 1, ( @@ -331,7 +331,6 @@ def add_gang_scheduling_placement_group_and_setup( gpu_dict = {} for bundle in bundles: bundle.update({ - **resources_dict, # Set the GPU to avoid ray hanging the resources allocation **gpu_dict, }) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 9fce0ca7af5..7f4d21e7cd0 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -381,7 +381,6 @@ def probe( except requests.exceptions.RequestException as e: logger.info(e) logger.info(f'{replica_identity.capitalize()} is not ready.') - pass return self, False, probe_time @@ -398,9 +397,9 @@ def __init__(self, service_name: str, self._initial_delay_seconds: int = spec.initial_delay_seconds self._post_data: Optional[Dict[str, Any]] = spec.post_data self._uptime: Optional[float] = None - logger.info(f'Readiness probe suffix: {self._readiness_route}') - logger.info(f'Initial delay seconds: {self._initial_delay_seconds}') - logger.info(f'Post data: {self._post_data}') + logger.info(f'Readiness probe suffix: {self._readiness_route}\n' + f'Initial delay seconds: {self._initial_delay_seconds}\n' + f'Post data: {self._post_data}') def get_ready_replica_ips(self) -> List[str]: """Get all ready replica's IP addresses.""" @@ -464,7 +463,7 @@ def _launch_replica(self, replica_id: int) -> None: log_file_name = serve_utils.generate_replica_launch_log_file_name( self._service_name, replica_id) p = multiprocessing.Process( - target=serve_utils.RedirectOutputTo( + target=ux_utils.RedirectOutputForProcess( launch_cluster, log_file_name, ).run, @@ -532,7 +531,7 @@ def _download_and_stream_logs(info: ReplicaInfo): log_file_name = serve_utils.generate_replica_down_log_file_name( self._service_name, replica_id) p = multiprocessing.Process( - target=serve_utils.RedirectOutputTo( + 
target=ux_utils.RedirectOutputForProcess( terminate_cluster, log_file_name, ).run, diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 3c6a4174d71..bd86d314165 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -8,7 +8,6 @@ import shlex import threading import time -import traceback import typing from typing import (Any, Callable, Dict, Generic, Iterator, List, Optional, TextIO, Type, TypeVar) @@ -158,39 +157,6 @@ def __repr__(self) -> str: return f'RequestTimestamp(timestamps={self.timestamps})' -class RedirectOutputTo: - """Redirect stdout and stderr to a file.""" - - def __init__(self, func: Callable, file: str) -> None: - self.func = func - self.file = file - - def run(self, *args, **kwargs): - import sys # pylint: disable=import-outside-toplevel - - from sky import sky_logging # pylint: disable=import-outside-toplevel - - with open(self.file, 'w') as f: - sys.stdout = f - sys.stderr = f - # reconfigure logger since the logger is initialized before - # with previous stdout/stderr - sky_logging.reload_logger() - logger = sky_logging.init_logger(__name__) - # The subprocess_util.run('sky status') inside - # sky.execution::_execute cannot be redirect, since we cannot - # directly operate on the stdout/stderr of the subprocess. This - # is because some code in skypilot will specify the stdout/stderr - # of the subprocess. - try: - self.func(*args, **kwargs) - except Exception as e: # pylint: disable=broad-except - logger.error(f'Failed to run {self.func.__name__}. 
' - f'Details: {common_utils.format_exception(e)}\n' - f'Traceback:\n{traceback.format_exc()}') - raise - - def generate_service_name(): return f'sky-service-{uuid.uuid4().hex[:4]}' diff --git a/sky/serve/service.py b/sky/serve/service.py index 4e37bd25da6..0ca75051f82 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -28,6 +28,7 @@ from sky.serve import serve_utils from sky.utils import common_utils from sky.utils import subprocess_utils +from sky.utils import ux_utils # Use the explicit logger name so that the logger is under the # `sky.serve.service` namespace when executed directly, so as @@ -176,7 +177,7 @@ def _start(service_name: str, task_yaml: str, job_id: int): # service spec and we could start multiple load balancers. # After that, we will have a mapping from replica port to endpoint. load_balancer_process = multiprocessing.Process( - target=serve_utils.RedirectOutputTo( + target=ux_utils.RedirectOutputForProcess( load_balancer.run_load_balancer, load_balancer_log_file).run, args=(controller_addr, load_balancer_port, replica_port)) diff --git a/sky/utils/ux_utils.py b/sky/utils/ux_utils.py index 17176a03282..14c7be937f3 100644 --- a/sky/utils/ux_utils.py +++ b/sky/utils/ux_utils.py @@ -1,9 +1,13 @@ """Utility functions for UX.""" import contextlib import sys +import traceback +from typing import Callable import rich.console as rich_console +from sky import sky_logging +from sky.utils import common_utils from sky.utils import env_options console = rich_console.Console() @@ -53,3 +57,45 @@ def enable_traceback(): sys.tracebacklimit = 1000 yield sys.tracebacklimit = original_tracelimit + + +class RedirectOutputForProcess: + """Redirect stdout and stderr to a file. + + This class enabled output redirect for multiprocessing.Process. + Example usage: + + p = multiprocessing.Process( + target=RedirectOutputForProcess(func, file_name).run, args=...) + + This is equal to: + + p = multiprocessing.Process(target=func, args=...) 
+ + Plus redirect all stdout/stderr to file_name. + """ + + def __init__(self, func: Callable, file: str) -> None: + self.func = func + self.file = file + + def run(self, *args, **kwargs): + with open(self.file, 'w') as f: + sys.stdout = f + sys.stderr = f + # reconfigure logger since the logger is initialized before + # with previous stdout/stderr + sky_logging.reload_logger() + logger = sky_logging.init_logger(__name__) + # The subprocess_util.run('sky status') inside + # sky.execution::_execute cannot be redirect, since we cannot + # directly operate on the stdout/stderr of the subprocess. This + # is because some code in skypilot will specify the stdout/stderr + # of the subprocess. + try: + self.func(*args, **kwargs) + except Exception as e: # pylint: disable=broad-except + logger.error(f'Failed to run {self.func.__name__}. ' + f'Details: {common_utils.format_exception(e)}\n' + f'Traceback:\n{traceback.format_exc()}') + raise From d598ed4d96f314cb79e32508d1a650800f72a64c Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Thu, 9 Nov 2023 15:47:21 -0800 Subject: [PATCH 186/223] Apply suggestions from code review Co-authored-by: Zhanghao Wu --- sky/serve/replica_managers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 7f4d21e7cd0..75e5f401a4c 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -379,7 +379,7 @@ def probe( logger.debug(f'{replica_identity.capitalize()} is ready.') return self, True, probe_time except requests.exceptions.RequestException as e: - logger.info(e) + logger.info(f'common_utils.format_exception(e)') logger.info(f'{replica_identity.capitalize()} is not ready.') return self, False, probe_time @@ -572,7 +572,7 @@ def _refresh_process_pool(self) -> None: if not p.is_alive(): info = serve_state.get_replica_info_from_id( self._service_name, replica_id) - assert info is not None + assert info is not None, replica_id 
error_in_sky_launch = False if info.status == serve_state.ReplicaStatus.PENDING: # sky.launch not started yet @@ -615,7 +615,7 @@ def _refresh_process_pool(self) -> None: del self._down_process_pool[replica_id] info = serve_state.get_replica_info_from_id( self._service_name, replica_id) - assert info is not None + assert info is not None, replica_id if p.exitcode != 0: logger.error(f'Down process for replica {replica_id} ' f'exited abnormally with code {p.exitcode}.') From 5170225c932b2ab3642974d7caf64d77e998c2a7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 17:50:04 -0800 Subject: [PATCH 187/223] apply suggestion from code review --- sky/execution.py | 2 ++ sky/serve/replica_managers.py | 55 ++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 8cfa417f6e7..e57315a4ce5 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1107,6 +1107,8 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) + # TODO(tian): Use this to check the endpoint and if the + # service name is duplicated. 
@usage_lib.entrypoint diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 75e5f401a4c..4a86c6e7c43 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -1,9 +1,9 @@ """ReplicaManager: handles the creation and deletion of endpoint replicas.""" -from concurrent import futures import dataclasses import enum import functools import multiprocessing +from multiprocessing import pool as mp_pool import os import threading import time @@ -20,6 +20,7 @@ from sky import global_user_state from sky import sky_logging from sky import status_lib +from sky.backends import backend_utils from sky.serve import constants as serve_constants from sky.serve import serve_state from sky.serve import serve_utils @@ -238,7 +239,7 @@ def should_track_status(self) -> bool: if self.user_app_failed: return False if self.preempted: - return True + return False return True def to_replica_status(self) -> serve_state.ReplicaStatus: @@ -276,18 +277,20 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: return serve_state.ReplicaStatus.UNKNOWN if self.sky_launch_status == ProcessStatus.FAILED: # sky.launch failed - # Down process should have been started. + # The down process has not been started if it reaches here, + # due to the `if self.sky_down_status is not None`` check above. + # However, it should have been started by _refresh_process_pool. # If not started, this means some bug prevent sky.down from # executing. It is also a potential resource leak, so we mark # it as FAILED_CLEANUP. return serve_state.ReplicaStatus.FAILED_CLEANUP - if self.service_ready_now: - # Service is ready - return serve_state.ReplicaStatus.READY if self.user_app_failed: # Failed on user setup/run - # Same as above + # Same as above, the down process should have been started. 
return serve_state.ReplicaStatus.FAILED_CLEANUP + if self.service_ready_now: + # Service is ready + return serve_state.ReplicaStatus.READY if self.first_ready_time is not None: # Service was ready before but not now return serve_state.ReplicaStatus.NOT_READY @@ -379,8 +382,8 @@ def probe( logger.debug(f'{replica_identity.capitalize()} is ready.') return self, True, probe_time except requests.exceptions.RequestException as e: - logger.info(f'common_utils.format_exception(e)') - logger.info(f'{replica_identity.capitalize()} is not ready.') + logger.info(f'Error when probe {replica_identity.capitalize()}: ' + f'{common_utils.format_exception(e)}.') return self, False, probe_time @@ -664,7 +667,10 @@ def _process_pool_refresher(self) -> None: except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # process pool refresher running. - logger.error(f'Error in process pool refresher: {e}') + logger.error('Error in process pool refresher: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.info(f' Traceback: {traceback.format_exc()}') time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) @with_lock @@ -674,6 +680,10 @@ def _fetch_job_status(self) -> None: This function will monitor the job status of all replicas to make sure the service is running correctly. If any of the replicas failed, it will terminate the replica. + + It is still needed even if we already keep probing the replicas, + since the replica job might launch the API server in the background + (using &), and the readiness probe will not detect the worker failure. """ infos = serve_state.get_replica_infos(self._service_name) for info in infos: @@ -709,7 +719,10 @@ def _job_status_fetcher(self) -> None: except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # job status fetcher running. 
- logger.error(f'Error in job status fetcher: {e}') + logger.error('Error in job status fetcher: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.info(f' Traceback: {traceback.format_exc()}') time.sleep(_JOB_STATUS_FETCH_INTERVAL) @with_lock @@ -724,7 +737,7 @@ def _probe_all_replicas(self) -> None: """ probe_futures = [] replica_to_probe = [] - with futures.ThreadPoolExecutor() as executor: + with mp_pool.ThreadPool() as pool: infos = serve_state.get_replica_infos(self._service_name) for info in infos: if not info.status_property.should_track_status(): @@ -732,19 +745,16 @@ def _probe_all_replicas(self) -> None: replica_to_probe.append( f'replica_{info.replica_id}(ip={info.ip})') probe_futures.append( - executor.submit( - info.probe, - self._readiness_route, - self._post_data, - )) + pool.apply_async(info.probe, + (self._readiness_route, self._post_data))) logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') # Since futures.as_completed will return futures in the order of # completion, we need the info.probe function to return the info # object as well, so that we could update the info object in the # same order. - for future in futures.as_completed(probe_futures): - future_result: Tuple[ReplicaInfo, bool, float] = future.result() + for future in probe_futures: + future_result: Tuple[ReplicaInfo, bool, float] = future.get() info, probe_succeeded, probe_time = future_result info.status_property.service_ready_now = probe_succeeded should_teardown = False @@ -771,7 +781,7 @@ def _probe_all_replicas(self) -> None: # from the cloud provider to # determine whether the cluster is preempted. 
(cluster_status, - _) = backends.backend_utils.refresh_cluster_status_handle( + _) = backend_utils.refresh_cluster_status_handle( info.cluster_name, force_refresh_statuses=set(status_lib.ClusterStatus)) @@ -845,5 +855,8 @@ def _replica_prober(self) -> None: except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # replica prober running. - logger.error(f'Error in replica prober: {e}') + logger.error('Error in replica prober: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.info(f' Traceback: {traceback.format_exc()}') time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL_SECONDS) From 8cea486b73bf02181e99b07a86e1302eb3086123 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 22:32:05 -0800 Subject: [PATCH 188/223] bug fix & apply suggestion from code review --- sky/backends/cloud_vm_ray_backend.py | 11 + sky/execution.py | 8 +- sky/serve/load_balancing_policies.py | 2 + sky/serve/replica_managers.py | 184 +++++++-------- sky/serve/serve_state.py | 250 ++++++++++----------- sky/serve/serve_utils.py | 5 +- sky/skylet/constants.py | 2 +- sky/templates/sky-serve-controller.yaml.j2 | 2 +- sky/templates/spot-controller.yaml.j2 | 2 +- sky/utils/schemas.py | 1 + sky/utils/subprocess_utils.py | 9 +- tests/test_smoke.py | 12 +- 12 files changed, 256 insertions(+), 232 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 44a0dd094ef..08ea12fce24 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3851,6 +3851,17 @@ def tail_spot_logs(self, def tail_serve_logs(self, handle: CloudVmRayResourceHandle, service_name: str, target: serve_lib.ServiceComponent, replica_id: Optional[int], follow: bool) -> None: + """Tail the logs of a service. + + Args: + handle: The handle to the sky serve controller. + service_name: The name of the service. + target: The component to tail the logs of. 
Could be controller, + load balancer, or replica. + replica_id: The replica ID to tail the logs of. Only used when + target is replica. + follow: Whether to follow the logs. + """ if target != serve_lib.ServiceComponent.REPLICA: code = serve_lib.ServeCodeGen.stream_serve_process_logs( service_name, diff --git a/sky/execution.py b/sky/execution.py index e57315a4ce5..0328e07d833 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -702,7 +702,7 @@ def _controller_skypilot_config_setup( err=common_utils.format_exception(e, use_bracket=True))) from e - vars_to_fill['envs'] = controller_envs + vars_to_fill['controller_envs'] = controller_envs return vars_to_fill, controller_resources @@ -1091,9 +1091,9 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: cloud=controller_cloud, ports=[serve.LOAD_BALANCER_PORT_RANGE]) controller_task.set_resources(controller_resources) - # Set this to modify default ray task CPU usage to custom value - # instead of default 0.5 vCPU. We need to set it to a smaller value - # to support a larger number of services. + # # Set service_name so the backend will know to modify default ray + # task CPU usage to custom value instead of default 0.5 vCPU. We need + # to set it to a smaller value to support a larger number of services. controller_task.service_name = service_name print(f'{colorama.Fore.YELLOW}Launching controller for ' diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index 27e640e1dca..76f6f42ab7d 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -18,6 +18,8 @@ def __init__(self) -> None: def set_ready_replicas(self, ready_replicas: List[str]) -> None: raise NotImplementedError + # TODO(tian): We should have an abstract class for Request to + # compatible with all frameworks. 
def select_replica(self, request: fastapi.Request) -> Optional[str]: raise NotImplementedError diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 4a86c6e7c43..aa4ca187df0 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -382,7 +382,7 @@ def probe( logger.debug(f'{replica_identity.capitalize()} is ready.') return self, True, probe_time except requests.exceptions.RequestException as e: - logger.info(f'Error when probe {replica_identity.capitalize()}: ' + logger.info(f'Error when probing {replica_identity}: ' f'{common_utils.format_exception(e)}.') return self, False, probe_time @@ -747,98 +747,102 @@ def _probe_all_replicas(self) -> None: probe_futures.append( pool.apply_async(info.probe, (self._readiness_route, self._post_data))) - logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') - - # Since futures.as_completed will return futures in the order of - # completion, we need the info.probe function to return the info - # object as well, so that we could update the info object in the - # same order. - for future in probe_futures: - future_result: Tuple[ReplicaInfo, bool, float] = future.get() - info, probe_succeeded, probe_time = future_result - info.status_property.service_ready_now = probe_succeeded - should_teardown = False - if probe_succeeded: - if self._uptime is None: - self._uptime = probe_time - logger.info(f'Replica {info.replica_id} is the first ready ' - f'replica. 
Setting uptime to {self._uptime}.') - serve_state.set_service_uptime(self._service_name, - int(self._uptime)) - info.consecutive_failure_times.clear() - if info.status_property.first_ready_time is None: - info.status_property.first_ready_time = probe_time - else: - handle = info.handle - if handle is None: - logger.error('Cannot find handle for ' - f'replica {info.replica_id}.') - elif handle.launched_resources is None: - logger.error('Cannot find launched_resources in handle' - f' for replica {info.replica_id}.') - elif handle.launched_resources.use_spot: - # Pull the actual cluster status - # from the cloud provider to - # determine whether the cluster is preempted. - (cluster_status, - _) = backend_utils.refresh_cluster_status_handle( - info.cluster_name, - force_refresh_statuses=set(status_lib.ClusterStatus)) - - if cluster_status != status_lib.ClusterStatus.UP: - # The cluster is (partially) preempted. - # It can be down, INIT or STOPPED, based on the - # interruption behavior of the cloud. - # Spot recovery is needed. - cluster_status_str = ( - '' if cluster_status is None else - f' (status: {cluster_status.value})') - logger.info(f'Replica {info.replica_id} ' - f'is preempted{cluster_status_str}.') - self._recover_from_preemption(info.replica_id) - - continue - - if info.first_not_ready_time is None: - info.first_not_ready_time = probe_time - if info.status_property.first_ready_time is not None: - info.consecutive_failure_times.append(probe_time) - consecutive_failure_time = ( - info.consecutive_failure_times[-1] - - info.consecutive_failure_times[0]) - if (consecutive_failure_time >= - _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): - logger.info( - f'Replica {info.replica_id} is not ready for ' - 'too long and exceeding consecutive failure ' - 'threshold. 
Terminating the replica...') - should_teardown = True - else: + logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') + + # Since futures.as_completed will return futures in the order of + # completion, we need the info.probe function to return the info + # object as well, so that we could update the info object in the + # same order. + for future in probe_futures: + future_result: Tuple[ReplicaInfo, bool, float] = future.get() + info, probe_succeeded, probe_time = future_result + info.status_property.service_ready_now = probe_succeeded + should_teardown = False + if probe_succeeded: + if self._uptime is None: + self._uptime = probe_time logger.info( - f'Replica {info.replica_id} is not ready ' - 'but within consecutive failure threshold ' - f'({consecutive_failure_time}s / ' - f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' - 'Skipping.') + f'Replica {info.replica_id} is the first ready ' + f'replica. Setting uptime to {self._uptime}.') + serve_state.set_service_uptime(self._service_name, + int(self._uptime)) + info.consecutive_failure_times.clear() + if info.status_property.first_ready_time is None: + info.status_property.first_ready_time = probe_time else: - current_delay_seconds = (probe_time - - info.first_not_ready_time) - if current_delay_seconds > self._initial_delay_seconds: - logger.info( - f'Replica {info.replica_id} is not ready and ' - 'exceeding initial delay seconds. Terminating ' - 'the replica...') - should_teardown = True + handle = info.handle + if handle is None: + logger.error('Cannot find handle for ' + f'replica {info.replica_id}.') + elif handle.launched_resources is None: + logger.error('Cannot find launched_resources in ' + f'handle for replica {info.replica_id}.') + elif handle.launched_resources.use_spot: + # Pull the actual cluster status + # from the cloud provider to + # determine whether the cluster is preempted. 
+ (cluster_status, + _) = backend_utils.refresh_cluster_status_handle( + info.cluster_name, + force_refresh_statuses=set( + status_lib.ClusterStatus)) + + if cluster_status != status_lib.ClusterStatus.UP: + # The cluster is (partially) preempted. + # It can be down, INIT or STOPPED, based on the + # interruption behavior of the cloud. + # Spot recovery is needed. + cluster_status_str = ( + '' if cluster_status is None else + f' (status: {cluster_status.value})') + logger.info(f'Replica {info.replica_id} ' + f'is preempted{cluster_status_str}.') + self._recover_from_preemption(info.replica_id) + + continue + + if info.first_not_ready_time is None: + info.first_not_ready_time = probe_time + if info.status_property.first_ready_time is not None: + info.consecutive_failure_times.append(probe_time) + consecutive_failure_time = ( + info.consecutive_failure_times[-1] - + info.consecutive_failure_times[0]) + if (consecutive_failure_time >= + _CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT): + logger.info( + f'Replica {info.replica_id} is not ready for ' + 'too long and exceeding consecutive failure ' + 'threshold. Terminating the replica...') + should_teardown = True + else: + logger.info( + f'Replica {info.replica_id} is not ready ' + 'but within consecutive failure threshold ' + f'({consecutive_failure_time}s / ' + f'{_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT}s). ' + 'Skipping.') else: - current_delay_seconds = int(current_delay_seconds) - logger.info( - f'Replica {info.replica_id} is not ready but within' - f' initial delay seconds ({current_delay_seconds}s ' - f'/ {self._initial_delay_seconds}s). 
Skipping.') - serve_state.add_or_update_replica(self._service_name, - info.replica_id, info) - if should_teardown: - self._terminate_replica(info.replica_id, sync_down_logs=True) + current_delay_seconds = (probe_time - + info.first_not_ready_time) + if current_delay_seconds > self._initial_delay_seconds: + logger.info( + f'Replica {info.replica_id} is not ready and ' + 'exceeding initial delay seconds. Terminating ' + 'the replica...') + should_teardown = True + else: + current_delay_seconds = int(current_delay_seconds) + logger.info(f'Replica {info.replica_id} is not ' + 'ready but within initial delay ' + f'seconds ({current_delay_seconds}s ' + f'/ {self._initial_delay_seconds}s). ' + 'Skipping.') + serve_state.add_or_update_replica(self._service_name, + info.replica_id, info) + if should_teardown: + self._terminate_replica(info.replica_id, + sync_down_logs=True) def _replica_prober(self) -> None: """Periodically probe replicas.""" diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 00bd511a060..376d90c0e39 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -21,29 +21,32 @@ _DB_PATH.parents[0].mkdir(parents=True, exist_ok=True) _DB_PATH = str(_DB_PATH) -# Module-level connection/cursor; thread-safe as the module is only imported -# once. 
-_CONN = sqlite3.connect(_DB_PATH) -_CURSOR = _CONN.cursor() - -_CURSOR.execute("""\ - CREATE TABLE IF NOT EXISTS services ( - name TEXT PRIMARY KEY, - controller_job_id INTEGER DEFAULT NULL, - controller_port INTEGER DEFAULT NULL, - load_balancer_port INTEGER DEFAULT NULL, - status TEXT, - uptime INTEGER DEFAULT NULL, - policy TEXT DEFAULT NULL, - auto_restart INTEGER DEFAULT NULL, - requested_resources BLOB DEFAULT NULL)""") -_CURSOR.execute("""\ - CREATE TABLE IF NOT EXISTS replicas ( - service_name TEXT, - replica_id INTEGER, - replica_info BLOB, - PRIMARY KEY (service_name, replica_id))""") -_CONN.commit() + +def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None: + """Creates the service and replica tables if they do not exist.""" + + cursor.execute("""\ + CREATE TABLE IF NOT EXISTS services ( + name TEXT PRIMARY KEY, + controller_job_id INTEGER DEFAULT NULL, + controller_port INTEGER DEFAULT NULL, + load_balancer_port INTEGER DEFAULT NULL, + status TEXT, + uptime INTEGER DEFAULT NULL, + policy TEXT DEFAULT NULL, + auto_restart INTEGER DEFAULT NULL, + requested_resources BLOB DEFAULT NULL)""") + cursor.execute("""\ + CREATE TABLE IF NOT EXISTS replicas ( + service_name TEXT, + replica_id INTEGER, + replica_info BLOB, + PRIMARY KEY (service_name, replica_id))""") + + conn.commit() + + +_DB = db_utils.SQLiteConn(_DB_PATH, create_table) _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG = 'UNIQUE constraint failed: services.name' @@ -87,7 +90,7 @@ class ReplicaStatus(enum.Enum): # The replica is a spot VM and it is preempted by the cloud provider. PREEMPTED = 'PREEMPTED' - # Unknown status. This should never happen. + # Unknown. This should never happen (used only for unexpected errors). 
UNKNOWN = 'UNKNOWN' @classmethod @@ -181,17 +184,17 @@ def from_replica_statuses( # === Service functions === def add_service_if_not_exist(name: str) -> bool: """Adds a service to the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - try: - cursor.execute( - """\ - INSERT INTO services (name, status) - VALUES (?, ?)""", (name, ServiceStatus.CONTROLLER_INIT.value)) - except sqlite3.IntegrityError as e: - if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG: - raise RuntimeError('Unexpected database error') from e - return False - return True + try: + _DB.cursor.execute( + """\ + INSERT INTO services (name, status) + VALUES (?, ?)""", (name, ServiceStatus.CONTROLLER_INIT.value)) + _DB.conn.commit() + except sqlite3.IntegrityError as e: + if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG: + raise RuntimeError('Unexpected database error') from e + return False + return True def add_or_update_service(name: str, controller_job_id: int, policy: str, @@ -199,62 +202,61 @@ def add_or_update_service(name: str, controller_job_id: int, policy: str, requested_resources: 'sky.Resources', status: ServiceStatus) -> None: """Updates a service in the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - INSERT OR REPLACE INTO services - (name, controller_job_id, status, policy, - auto_restart, requested_resources) - VALUES (?, ?, ?, ?, ?, ?)""", - (name, controller_job_id, status.value, policy, int(auto_restart), - pickle.dumps(requested_resources))) + _DB.cursor.execute( + """\ + INSERT OR REPLACE INTO services + (name, controller_job_id, status, policy, + auto_restart, requested_resources) + VALUES (?, ?, ?, ?, ?, ?)""", + (name, controller_job_id, status.value, policy, int(auto_restart), + pickle.dumps(requested_resources))) + _DB.conn.commit() def remove_service(service_name: str) -> None: """Removes a service from the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute("""\ - DELETE FROM services WHERE 
name=(?)""", (service_name,)) + _DB.cursor.execute("""\ + DELETE FROM services WHERE name=(?)""", (service_name,)) + _DB.conn.commit() def set_service_uptime(service_name: str, uptime: int) -> None: """Sets the uptime of a service.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - UPDATE services SET - uptime=(?) WHERE name=(?)""", (uptime, service_name)) + _DB.cursor.execute( + """\ + UPDATE services SET + uptime=(?) WHERE name=(?)""", (uptime, service_name)) + _DB.conn.commit() def set_service_status(service_name: str, status: ServiceStatus) -> None: """Sets the service status.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - UPDATE services SET - status=(?) WHERE name=(?)""", (status.value, service_name)) + _DB.cursor.execute( + """\ + UPDATE services SET + status=(?) WHERE name=(?)""", (status.value, service_name)) + _DB.conn.commit() def set_service_controller_port(service_name: str, controller_port: int) -> None: """Sets the controller port of a service.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - UPDATE services SET - controller_port=(?) WHERE name=(?)""", - (controller_port, service_name)) + _DB.cursor.execute( + """\ + UPDATE services SET + controller_port=(?) WHERE name=(?)""", (controller_port, service_name)) + _DB.conn.commit() def set_service_load_balancer_port(service_name: str, load_balancer_port: int) -> None: """Sets the load balancer port of a service.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - UPDATE services SET - load_balancer_port=(?) WHERE name=(?)""", - (load_balancer_port, service_name)) + _DB.cursor.execute( + """\ + UPDATE services SET + load_balancer_port=(?) 
WHERE name=(?)""", + (load_balancer_port, service_name)) + _DB.conn.commit() def _get_service_from_row(row) -> Dict[str, Any]: @@ -276,22 +278,20 @@ def _get_service_from_row(row) -> Dict[str, Any]: def get_services() -> List[Dict[str, Any]]: """Get all existing service records.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - rows = cursor.execute('SELECT * FROM services').fetchall() - records = [] - for row in rows: - records.append(_get_service_from_row(row)) - return records + rows = _DB.cursor.execute('SELECT * FROM services').fetchall() + records = [] + for row in rows: + records.append(_get_service_from_row(row)) + return records def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]: """Get all existing service records.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - rows = cursor.execute('SELECT * FROM services WHERE name=(?)', + rows = _DB.cursor.execute('SELECT * FROM services WHERE name=(?)', (service_name,)).fetchall() - for row in rows: - return _get_service_from_row(row) - return None + for row in rows: + return _get_service_from_row(row) + return None def get_glob_service_names( @@ -305,75 +305,71 @@ def get_glob_service_names( Returns: A list of non-duplicated service names. 
""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - if service_names is None: - rows = cursor.execute('SELECT name FROM services').fetchall() - else: - rows = [] - for service_name in service_names: - rows.extend( - cursor.execute( - 'SELECT name FROM services WHERE name GLOB (?)', - (service_name,)).fetchall()) - return list({row[0] for row in rows}) + if service_names is None: + rows = _DB.cursor.execute('SELECT name FROM services').fetchall() + else: + rows = [] + for service_name in service_names: + rows.extend( + _DB.cursor.execute( + 'SELECT name FROM services WHERE name GLOB (?)', + (service_name,)).fetchall()) + return list({row[0] for row in rows}) # === Replica functions === def add_or_update_replica(service_name: str, replica_id: int, replica_info: 'replica_managers.ReplicaInfo') -> None: """Adds a replica to the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - INSERT OR REPLACE INTO replicas - (service_name, replica_id, replica_info) - VALUES (?, ?, ?)""", - (service_name, replica_id, pickle.dumps(replica_info))) + _DB.cursor.execute( + """\ + INSERT OR REPLACE INTO replicas + (service_name, replica_id, replica_info) + VALUES (?, ?, ?)""", + (service_name, replica_id, pickle.dumps(replica_info))) + _DB.conn.commit() def remove_replica(service_name: str, replica_id: int) -> None: """Removes a replica from the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - cursor.execute( - """\ - DELETE FROM replicas - WHERE service_name=(?) - AND replica_id=(?)""", (service_name, replica_id)) + _DB.cursor.execute( + """\ + DELETE FROM replicas + WHERE service_name=(?) 
+ AND replica_id=(?)""", (service_name, replica_id)) + _DB.conn.commit() def get_replica_info_from_id( service_name: str, replica_id: int) -> Optional['replica_managers.ReplicaInfo']: """Gets a replica info from the database.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - rows = cursor.execute( - """\ - SELECT replica_info FROM replicas - WHERE service_name=(?) - AND replica_id=(?)""", (service_name, replica_id)).fetchall() - for row in rows: - return pickle.loads(row[0]) - return None + rows = _DB.cursor.execute( + """\ + SELECT replica_info FROM replicas + WHERE service_name=(?) + AND replica_id=(?)""", (service_name, replica_id)).fetchall() + for row in rows: + return pickle.loads(row[0]) + return None def get_replica_infos( service_name: str) -> List['replica_managers.ReplicaInfo']: """Gets all replica infos of a service.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - rows = cursor.execute( - """\ - SELECT replica_info FROM replicas - WHERE service_name=(?)""", (service_name,)).fetchall() - return [pickle.loads(row[0]) for row in rows] + rows = _DB.cursor.execute( + """\ + SELECT replica_info FROM replicas + WHERE service_name=(?)""", (service_name,)).fetchall() + return [pickle.loads(row[0]) for row in rows] def total_number_provisioning_replicas() -> int: """Returns the total number of provisioning replicas.""" - with db_utils.safe_cursor(_DB_PATH) as cursor: - rows = cursor.execute('SELECT replica_info FROM replicas').fetchall() - provisioning_count = 0 - for row in rows: - replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0]) - if replica_info.status == ReplicaStatus.PROVISIONING: - provisioning_count += 1 - return provisioning_count + rows = _DB.cursor.execute('SELECT replica_info FROM replicas').fetchall() + provisioning_count = 0 + for row in rows: + replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0]) + if replica_info.status == ReplicaStatus.PROVISIONING: + provisioning_count += 1 + return provisioning_count 
diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index bd86d314165..79f3277c740 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -520,8 +520,9 @@ def _service_is_terminal() -> bool: return '' -# TODO(tian): Use REST API instead of SSH in the future. This will require -# authentication. +# TODO(tian): Use REST API instead of SSH in the future. This codegen pattern +# is to reuse the authentication of ssh. If we want to use REST API, we need +# to implement some authentication mechanism. class ServeCodeGen: """Code generator for SkyServe. diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index f94034ca986..52f3fd9c600 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -44,7 +44,7 @@ # e.g., when we add new events to skylet, or we fix a bug in skylet. # # TODO(zongheng,zhanghao): make the upgrading of skylet automatic? -SKYLET_VERSION = '5' +SKYLET_VERSION = '4' SKYLET_VERSION_FILE = '~/.sky/skylet_version' # `sky spot dashboard`-related diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index f7e7c6ae2f7..5648c08516c 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -27,6 +27,6 @@ run: | > {{controller_log_file}} 2>&1 envs: -{%- for env_name, env_value in envs.items() %} +{%- for env_name, env_value in controller_envs.items() %} {{env_name}}: {{env_value}} {%- endfor %} diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 26f9e22f65d..5bc892e8365 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -38,6 +38,6 @@ run: | --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %} envs: -{%- for env_name, env_value in envs.items() %} +{%- for env_name, env_value in controller_envs.items() %} {{env_name}}: {{env_value}} {%- endfor %} diff --git a/sky/utils/schemas.py 
b/sky/utils/schemas.py index ac523b00c5e..2b442473275 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -156,6 +156,7 @@ def get_storage_schema(): def get_service_schema(): + """Schema for top-level `service:` field (for SkyServe).""" return { '$schema': 'https://json-schema.org/draft/2020-12/schema', 'type': 'object', diff --git a/sky/utils/subprocess_utils.py b/sky/utils/subprocess_utils.py index a77a840b89a..0a53f6b11d8 100644 --- a/sky/utils/subprocess_utils.py +++ b/sky/utils/subprocess_utils.py @@ -99,7 +99,7 @@ def kill_children_processes( This is for guaranteeing the order of cleaning up and suppress flaky errors. """ - pid2proc = dict() + pid_to_proc = dict() child_processes = [] if isinstance(first_pid_to_kill, int): first_pid_to_kill = [first_pid_to_kill] @@ -120,12 +120,13 @@ def _kill_processes(processes: List[psutil.Process]) -> None: parent_process = psutil.Process() for child in parent_process.children(recursive=True): if child.pid in first_pid_to_kill: - pid2proc[child.pid] = child + pid_to_proc[child.pid] = child else: child_processes.append(child) - _kill_processes( - [pid2proc[proc] for proc in first_pid_to_kill if proc in pid2proc]) + _kill_processes([ + pid_to_proc[proc] for proc in first_pid_to_kill if proc in pid_to_proc + ]) _kill_processes(child_processes) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f523d4887a2..0609b831753 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -180,7 +180,7 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: test.teardown, stdout=log_file, stderr=subprocess.STDOUT, - timeout=15 * 60, # 15 mins + timeout=10 * 60, # 10 mins shell=True, ) @@ -2874,7 +2874,15 @@ def terminate_replica(replica_id: int) -> str: # be restarted f'sleep 20', terminate_replica(1), - 'sleep 180', # Wait for consecutive failure timeout passed. + # Wait for consecutive failure timeout passed. 
+ # If the cluster is not using spot, it won't check the cluster status + # on the cloud (since manual shutdown is not a common behavior and such + # queries takes a lot of time). Instead, we think continuous 3 min probe + # failure is not a temporary problem but indeed a failure. + 'sleep 180', + # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time + # that the output of `sky serve status` shows FAILED and this status will + # cause _SERVE_WAIT_UNTIL_READY to early quit. '(while true; do' f' output=$(sky serve status {name});' ' echo "$output" | grep -q "1/1" && break;' From 8711f458d3b846ab92b50bc863913968a5ddd51f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Thu, 9 Nov 2023 23:43:46 -0800 Subject: [PATCH 189/223] apply suggestions --- docs/source/images/sky-serve-architecture.png | Bin 0 -> 624835 bytes sky/backends/cloud_vm_ray_backend.py | 42 ++---------------- sky/execution.py | 33 ++++++++++++++ sky/serve/README.md | 8 ++-- sky/serve/controller.py | 8 +++- sky/serve/replica_managers.py | 36 ++++++++++----- sky/serve/serve_utils.py | 18 ++++---- sky/serve/service.py | 6 ++- sky/utils/cli_utils/status_utils.py | 1 + sky/utils/ux_utils.py | 6 ++- 10 files changed, 93 insertions(+), 65 deletions(-) create mode 100644 docs/source/images/sky-serve-architecture.png diff --git a/docs/source/images/sky-serve-architecture.png b/docs/source/images/sky-serve-architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..9312ccad25c41f662b510f3ec68a5c2dac76ee0a GIT binary patch literal 624835 zcmeFacRbc_`#&xrd+&LXSw>~=9c6}4_8y7skvf*zF`mcscpm2+sjaC(ginu;fq_AEMpa1%1A~+e z0|U1O7Zd)a^Z2cg7#R4E9TXL{&nPOQwB1~69h_`1FjOP&B^=Sa`-%M8!d`AT7A799 zD&9-HPCOJx2({y+vYrC@9c@d(5L*LkgDZS_5!Z-JsLowEA61xT`Kp{-WR~D*1ow=k zyq?|=3FGw_3)O3FDXHSiHmAI8=QFOlb7I6g80QNe&cTS~c^j{cv;JY6f_&dp0d2d`HksDtvU-2ml+S{&;I%XF8`Y^^ADz(0S%!bV$vx7BV zFP;d)k!4J@r8WNMvQ3D?sciWWmRqG)?u5S)x4GUi8JRe#N?o6fF>=PZNF5jBE$vd5 
zyN?1p$y*u=BRn%+d0Jl*DM3M+?_{T^G2PiuM=kmUo^J8>4GDUel-2xp0-s8%G6C<^ zR#a&=-N5$Vvg@|X+xM|rJtvgo*zaJUW}}`WsLH50VfgA!T*)F2^)lBOmmxbzbdseh z^>Npz4dY?+*Po+HVtF*Fwo0jdpML7luZ=%>nE3id*9(xB zP;C$>b-3!L9=%DFGFJE_Z)A!#zFy$2%CSuvRZRb!na$&?=}9Ytfv=rje|9?Ok!iUw zFtv+K{?((c&_~(DA6-1`p&rdyE|(TC z>ekcJq{tWMwFZ<`56jLyNj^7_sW*g;X(?50OO3amJ#JORx&DlryoMu#Duf z`>2b?5)$dLL-Gam`PHNsW@_&HWY<>@sQxCxfp%=5rR?WD~F76i^JcIIz|9CQe&s#tyiO$*jmgashtq;#6Jwca zn0G~w?HCn@BA+7FgBIW72} z_9m)|EZ|T8!(cG8LQq5ONeSBH(F|cRkxjTKD-+q&>1<_l4!M)L({zPzTzhRPTw-fT ztdp7KB(OI&ymJg}UPMlS_$A5fA`#bug3)piu9cIY94N149&F6s@F>@ z*2&b#aWkF>Ye9v|oqSlrZQOF|EXM@*SB{FE$Gs*Q+U5T0|5Kc|bPtKq> zd#ruA__J@B>9HtBC7W};-96m_-8Z^r=;LiqTXl8Xb38lxjQDeGRdUq>ze2xszlWO+ zE6zl5Vfi>lM-{Dt-<*uON}pjj*=F8G`rc6Tf%BWkbVri~gOxH=oOWc;Y;ki0(+)MYd zr;F&ry=muO>0-&e-22}iC*)Kp7oT^L%P3gL7x*}w6?A%!!z8UO+wQ}=DVj+|90?pW zj$5!a4inCUFx#+`VUGxBh%#u&VrWn2pEipA932+D5~CQCK-)_%E5w?3LS2coU;VSj zY4uk1arNFrk))avT|!nVREa&w(jr_bSCa%z6^qnbq07t-vyEm;o;%G4%3AtwbV{ZB%ls?#q^NxH7)e3~n* z{Uq_pwr{>Pz&h8K#=4@V==J>Lmk*`eYdUvt9{%)d*rHOgw1Gk8xC(k)vuR6HFIPKP zSo5-`S(-_zO`B)iVg5J#Z~4#jn-J0mHyAcOpN(=|aW$FuYgxIQQfgG*f3l)>;PGJB zAW3CyqLqls3A^D`QxZjzq9*2B&RSJ=Pro_0I}02ZC^9JP6?8f3QnMDg5SW847~`9` z@ww@v%Ga(D!Fo3MgJJ{EgS^=hvlIM1Rn1_HL43i(&H;i!nst5fTqj z_PzdSv)m^xU)_PK!GwIaf&Ah=&<> zL#*ChXxtFSWOx;s7wyCBKvT*7R62^j3^lFY9d4g7Nz`%6^1aoUlb4?ydvfN9@ROsi zHLed`2bW666W*Aqeb6N365<*ox1w0%B5q$vdEx17c1Ks|T<1$)+vV?HO_GYGWvzCn zu#XWuC%DOFb!Yqz2}jUqnERv32A@5xM#6<@|p7=<_m-{ z*SFRmx}A`xF2rpcdA!l36SK?fVN7;-E{iJ5yNGvr?TvPvcIk0>XF54(sHgef8};#& z-Jw47f~J!1bCta+HE(|gwoU6h$F>~Sh7Qll%Izb*S~u-9ZQ7a!`Ub{*3w-ZX&uqkR zTw2~D*d_=xjW&I6G*I4X{O7R^-i%RrGmj_`1y`nXx^CfLoJ(tdzw9;3vM%QGgWx`Efn@YeDTP- z_uJsu;MyA2tV5y5WB&ddgDItJ8#R3abFzYgMW2da)>`lN9=>yU@W@dT(>&MweDQ2| zdqqCuP)l{5b}^aJ*%WtOfhza`Z`*|&< zE*iV^UoThrHr_s5N$Prk+;Z{tyM%3lG#9(_jLKQpjc%%n<6On3ibPD@Yd-o6PWd&s zXx0jQh}{`Z^?z%122Z}b@v2h1ns>J zjy~eT{8frE;Mv&8udYiJEq9;hN@zU_s#t$n(_gveam%HB#qL}?^U8NWvz^;3O$jyi z-@bga+1cC?7ML|lrC(%uzTr0iuEwC#;6bo*P{HnnmCxHNk+s!(RWg*a-D{^;ZD;4I 
zye}>gS9|*pi@c``++`9rjpqpC*vdLT6YY(>{z3z>7c{R*@%jRax znaWf1jojB1C^a>(N9GP;G~;1tyI{(tN{Z`h;R4u!IJ~V=&pCd9qawJrgOn}v@V|b>Z9#tZDHgpr{DpT#^@2MF1~mur z=g=7)_C*W~IgB$(@_IgpW(ThhoY&nITXYGX3X4^UVk0}NM3caOmWC&R+%k5+Zop0f zN0G4pgpQ(~ZEVhfLe|!Eb@CQeLX>gmhmh<$K7lg^?JhxUE*ZJG%f@ZKZOcDof_xsY ze7aOzwt98JZz6T&(<2>I)1kd}Qk2{^j6;}Mxc|)$yWfJ2DgOWYAg&W>tnJ8||34r7 zj-5^CL-}95=FeY7`yX<^VaEP1mxz28WhR%5Rzd&Q+k;EmOase2_rGQSKX>4Nr}=-o z-T#dCA3XhkzWEOU?SCQYAL91^LeT$05bVJJBJKYo?f<~u{~ZeU>x2I#l>a3ZWCy-u zm!stOYsOAW!wToYis8T)man6F9vKv0+`I6?Yp#7pcJxEbz2ld<^KRQ3B$NF=i-11_ z9&Q>;wRqlqgJRQ%XYZ98?&VkQQha>dU3kGg<#PEaul~!&4OZWLb93d=xP!by6bjXv ziEEflj*`o)CkUau{`v+@(CR010ifHIVXSm0yg@rudi?A2@-c?b?8|F z-iyyBU#?_qEPoR)EHNLg4>b$iuvyKuQ2Ga$_H%Qn&S#!`aQfLbiP=I`S9yazpdJ7UabTXPka;##H1+wAYc|M3)5i7)^s z-uZ|PD6%stv25E?#V>z;Hr16IP8n?JZBXXeQ{_1`U(!HacBz+mZxD{Z;=Z@X0U(b{4+x+hz9{hqfwh zTa&?f!`H!N4GQ;o&Gs&oJALXk3);!p+y0hd7RWOc>QuY4ItNxUT(fgqg}HK}W{bL* z!fxNP(RT<`1-UElGb7tYfteAsyRkeYtDB>~vAeZ!Lnt@R>vO(EL)eSCa--mtj5O~d z-vkpcyuB`+Y)v7~1H3fQ5V*W1RUU8ea9Budt@D~zuiw>@x$w;XT(b%{x#ok@F*hZq z^NLJfc+Gs|t0sJ=`pr>&e=E*%gS~`Y)8vx7CPzn54Hm2=yYoY^T}trM$3mf8axd0C zyB)iQmut=g&zI7#&eM7Es4g3~9?*)yc=$mUwWOlk^v!r>Xg^c z=E^Op@6AMbIH5xB5&!Jl@K)&WY>R{5!>y^}?r);$x^gs(Uix{mb|Kc#E;b&$yR${p z`G$2r_Qlp*6AJ$E_!&DKmOl3GMtjgu(B6*4kzj?>mp^36C-9pnT$jH=&4=&smGmFO z%KnB-7>s~hCP{cse|dWsS&71TBFEmMfR7jK(%jp`W9WsC)zV!0ikw_50UWv8D1YK% z`08C*bnHb^azcgVS9>%u=N)?rdR|^#1J@~c>^VlQ4=;FtwP&&sY;P~Ow-@iJC!Km2 zJ0mp_rwi6sUGiF<^)lnn*cgfnApxriS--WBd0&ZzVmNV{Y|!o-9A}6iCz}|%z&~Py zNx?M_^2B-q|DBcIcA2f&5IhP<4liR&oW8ZC2~;oEgOgn@M2qe3TVEn*?wNn_T``7ormOESm0>yRjT)J{v zITDQB#v-|>ee`1YU!4+(GIG@N$WpaNR{LE@@NkgShgM{EYi14DS}hBZM~fZsd01$d z+8wX$^`*V_CRu6$Q)5)~t0OG_8xslP_f)Z&seYai9`Lz%r04gah-_}fkl##hM&QMf zSc{6Mi|w1i#1eUR4-WW9QdL~_R5920i6=|#+BF+R_4>;%OI{hNOP9X-?=>oc4YA=P z3^FVcqT9=juPMU;YlE2KQl2}x5ZwR$YX7*ca;%sT;{4V}!@0E5jm=ZwP)s~~&WE!E z`k+;?ktlIZ4xz|TB0uP3LuT|Hj}g^OlMwdo<}2?TixzAXTYfKiskbQ2PcP&E#)BBx z6hZaQYCnnej}N8oGDagj7WthlyNltcW9X@V9ixal>}krz;gRonq|NDd-!g&%=gN9- 
zf+Kl0Ls}%EJ%QS82nc8LQk6ai@MGWH{vqR{$MCAxhOUyxtdG38Lsax)zG|kpw#G+V z@&@wpVf=8x7J1gBw zeyg*-yBpv7y?M`{KG>F`b!dFRDEBsdYpI{dPpyyNss)os5pvL(4xZlkiycOg04;$XQQ z70Y(QWAX!_Q7(>cSvd5*SXRaQt9rkVqwXsJQS;@>-l90$rPxa8K`u@*l#a(jK{^#*epGqG@c{_riRn4h>+%k*(-wKK;kNL>rT? z!|R0+(w%bq(%k(vWRAH0Xfm{?EQ7vC_rqylh+h5K1ngSUq= z2ICOXvyT=0V?g=2>}DSHr&7CNw|9I1O`R?Exl*sxmsm7Rcjw=Yc!XeMIc#b`Uuc&i zwWEmrgJUkZ1TLpEnp;o=%Ln76d32tMe17a8eht6lrayvYJa`m&SHEYkNjM415G1r; zhj6R_b}-7%&V=R=<^^(Eu!vw1nMB%Ifb%cDCSRf#aZ3F8P6s$KR1>yoz)H`n;3ERY z&rh6e|CecCo!Z^nle*kg?7XdVR7gW2gI!Mau3j-2ylssa134F6U8|n@*5^r1JIwJe7K; z8}lFXujM?7iLeSn-%f#Wm-8H*3Pkpw$gBm?P>Ix(* z1P4YERKjs9NRjF7Z(kHEY@g+Tih(>hJXX#6*p0xQO~8+WGKOgqo=`T#$EGuo90Txj zpn%VY7tJKO`uS3^hH-~OZrW-^?GE3sC*dRH0Oo*F0+_uBmTh}_K<62*065R3R}geP zq1>2<;(00a)B(-?Sb816+@k95EdarI109$oy%+^;)qRXE{o1|>lT0_?T8w3+(xS$K zQmT@eNrK&tP~oF|bK^a$SnMFh0~`_@r1n=#KBIENtMrUseHKee>M0_R0YNQPFKhn$ zNpMgTJH*MdhcM}AXoVg1_I7tD8jSs>9)3&nHvKP-N#F`3_%Q$Ab(@bxCRYcZyRr_W znDvl?6#e};?BDi~q#&e9223PC0_eL#>(Cs_Dt&cT{ip98;E+m^a0yvtwjuRLnK|7% zcAhzCdkOIB>);l|Xtgz=p&!@9VD}kdmEuTFigOj1%h=H#@VM>q=Nc&@2R|VZQS{Py z6A8T#w`&iH?DiEXf5_z#fJn;i4cM(^kWbJ8iL?9IU*5k2kPk#yPL1TopYL)2lk1bh z22RFn=`~HU$qSLd4HBuvIw&`&ZgC?PD_2PY;dS0Q36BDaM8}J(zkf7xiMiEX?Yn$m zG5lk`F4xb^J%Gv8edR^+fiWa*pk{6yMfMQv^@d8^e$4a7)8jYm5;lU>S<8=ne^# zP|6{PD#e6sR|N=a+{$Bqz<2V){a>$f5U*j!isv`+gDR&rLsphKXz|sNt+fS~e;o^8 zX>ESzyJyIeV}UfXH%bx|ajP3bH2~7%%;yfqz`*-rV%O|$&NL89^14n!2_U`kh~T(u5EP6IXjUf8_u+WHA34$1iYNM-%xdk&BYQYhEKU>e*C zu?eV^_$mh6-tp+>FuFF9om1LZxwsk2CClF&M!{N8kL-#(kGp416@`OEFxbTeC4dd)IY1O<;HK|pV$TURS!zVh3c;s89PHAVE+&rorI z5G|M4hY55EzCd7_On_f^!$76`af6}(phUQ{_77Sd%6rrimna@USgz=qq8HAHcUsH? 
zI}Wknzc2}H1vJ#4#X1kqLY;kcGwGz&>G3x)DZ-bif1OY-*0Blr?{n(=aM&O^DxWX6 zRqFL|mRR{~x4#3c!}VV{YbQA^@Eq$);}3=k^zy$K2kp#^1e4tSdBU-}2qiS_l}L!> zP#-`m7b~1!LLqLSt*a@{;p9wup6UR;v~Mh#-PoEC-!7EeDi4O45G#~HX8D&dsemud z4-g2l?)%aU)R)?}G)YnV{LB62K-xwX9N64q@z6+d#04N7n4GM(t<|}$KC4^G=lA`i znF-xeDM$s6xh9u`-3BC)&&^i=&;8%=RapCJ?;H?NU6X@?n_{Ny%}u7QT-lvS*c`Cs zOn~x?)jT;$*W^xP%>$p`@j#{N1@WFO(Vkn(!L1CXi23+pSrt$KV`)g15vWej*oyQJeEu?*Bcv`_w(Gv6an zV}tJmtm!E$=`II7%U3X9$?<5xF)|xi=YgVh^YuCw;H7}1P(I9e@fr7%n&T4L`%Yqm z2T3R`q&3JY*<&RDdwRoj*&&dxKb=MA^JAYukWy@LA ztgSX+J^LT|y176CKK}LL7VS(Bx0=!syVxTEQWN3q=6Z|Gx`S4xd$`mSN%ssKJX8<- zoTmaOtlmVsw@Zu4%tL@o5p`ky1us^yXKiBx+>h>~i-~uTzc(h9ZX=a@6N>ntX&L>L z${tu@I=fV|j0P$LW3uA?B1kR?A(1J(g5k?u(ZC+@n^XzW=RiLVIP07LMf4!RGysYb zY4EgKx5V+R4Y+IlG)eR+vb?y21Ov{)3~_LvNfe1=V^?Je zgbyxAe{ViY!Kohc^V6|5`~BVv%$-+Y6Pi=&>O%<_M4cJ;Qs$t8^%?leQ=(bv2Q2;N zfUQJOF_hR_c^O|@Q`#Xf6dc1NL6*A$m>`wq5WXF!bwXMk-DH9OP{P?qw@yvL?7CjIG_2Z(&vQ=zod8IK2Q>Z;LnJc3Pcd};?0Cc zA;oB9$VmVD#k1+G4}Btqn5U0&x`QwsZ<@sRwwK4xA2`c#t6(CY?_VBCWx1sIv;06B zE=k%wT)^j1(VntyhQ=uskvP;K&`JM=LJqR)+-8_6OrA)C<$-uDu&@+dn$f@9%VFQW zl9(0ue!5q|`9}&muMlO4MFhBdYvRcV(B(~HIM@CUG3ehW9P5Ba`nym>MW3GPdotdg zulp|}(gP!+o9Gco;NNplQ9NJ_K4I9`B6m~%O$X7xzu^D=0Zm8{GDs-lF-b~v3d$M= zdL?=U&ow{0Hb{7iuP_M83~r>K0mS%3ic8I>695c_O@XFa93~{rzjR>C5mh()sZ>jH z?wLK5loHdSG^YQGNa9eTTes=MhCqQ1p#0oTd0O7tSdJ#t_$1Cjjst#2P9KX{{7dY5 z7!Dw^v1mHMr>CCELzF40g!-&+AOoXU3$Vrjcw@&lcp`l#Vb;$g=ODHk=Z)^tl9dj` z8|`_vRIpDgSgS&y4m0-zp!5ct*=Gc;C9}a!{}t+&#{i-K3!wC!|AXxgS5*|SuiJAK z!@tsDEGrl)-Sr;6{gVZHddw_Bj2WEtUtj$7u~R@>q9_VB*%1^E z;+$kbJ58bYXSmXyf#s+iJmlojVx~!Y4?(8Lx`Zc?XHxET;a5;rhdkdLQXm7Jx94>` zYr`xo4_t<8e(l&nHd>Ds4T=!)s`%>a>i@*S<`B7uixrnSHxa+ZI>og(8qOl((8)es z3sRBD!r))I^FR;4K#N@d%xhNT-w-Yt3PEU|E%^vr>T zTCyrX6>;gHC^2&k@5fXZS5;mp0gcR3BgHQSch&?9CI2q;($LWa(ah2k^berTs>1AR z{a+YK6xW9r-V5u>uMKX$9^a2-+Q*>uI~#hA3!G-ZlMA{U2ce;*CKi@^2e0`Q@R26E z`?t1^htklC1?j@CVotjX`TAa67sMw=KqwS>i?#J9--TfKf7)tv)UjAYzSXtby_Z?0 z`H1LCE(y^VEgPA`rzzz}K+=*f=4$Dour@yk9bl&aqE5}i_n7T-?KSa^DVhg}7t^v| 
zeg5%<^-)jmy#i^%-A_wDG!H@@AipDi^aGUh_WU5L8vRd%jmv_vQKzw+3h_f)*+74x zH~cSV$?yCt0{_!SAe*JG~&~%f@=t*&J50ELDg@`QeUbQP8DfiPl$lEd?VVit5 zKlnn$=DTIm1k3>qe4kgfpQHfRixF|A+43Scmj!&f>B-tas zCI2@l8NCTk6lOlU{`2N4x?L@M&5E|L>GC0+G+@X?Z1ekPQ;jC zq>*N}{w5csiuPdy2U%YhJC|evJoy4eA^;gs9%YSIdCk%P#)v1>%z;sdOnBbs?Z;cP zyR^2ZAc=xLNDtIZ@YlqQ!u$B&uJaR3pG7zmh~7k@qei3{E))#$GJ9Xq$38BF{cnv4$|s}~`kXh4gY8E<0I$Vt`p43=nv zWAk>Bl&{-#DD?URK_{pYlc4_vWNG+8g6w|KS-a2>Ml}8k=sGB3X_i#08R^4y$8PKJ z(5%-fNqa**2N>ozHD_!9dJ$VW&UUeZL^h(MxJ$P5=crw z5zetgQ5KC=FK|QHu{)xG>gfmQH9$StBrpf1jO^D$tFha8)gZEkPO4aeGdEQKflyp& zA?shipRJa_e}hIq(|9XWJ~(h^#*FLwL&Le@+C6AP+?ib5_?F=Uaaqa-TB6cW{lO+w z1Gtug9^#T}eq63rMj)`~mkS@kApn|pu~K>cw?gIuDeiDI+zOF}Zj+cP1l1_&7vy}4 zeJr7xKpH$6$j(uYAOwXx0(C_a^Em{{{RxMl$?*{>Pp`4N2KY5%4r~*&A+8Ns-w?i} zkdIjHjTs;$@-}OC$$(eav4KL~L?leDbM+=JPp}m%wHAh6>S`f0uTJnostZ*fQ?9x;gi~8Y%C0p6Pki3-Sb!yJ}uZ4>_(t^30iM{0?VoXNkQRCLX?XF$i{~aeM2> z)ZK_|-84{;NVImkfJ7*{#7us3t_&RQ&(0YaDFItmn)}>5*%Pe-mj?xmfjYun{)V#CwqH<5c{|Mx--`K!|Nk{fkWc&4KkZ zEwId0;7r4WZsGf)`Hwxo1d7}N%Ef8Yr2Y>`G*c(6Sovra{v%uIM4ICmAKz4}j!z1m zplwKYT7AFR-`d68?&!vMj%Gfip{FwL>^HL5@SM)w=6_QLd^#&OpnG9>!el4@cA0?0 z-z!mWMYQO!|1#1@g7|@-qzF3=1-LQ5HPENrhZI^M+0>1sQE6&;jT`0 zJb?HZX)e*TSkVMPa{vSt9`w0goV%YBJU(Cgn>>}90u74DfZ~^{n-Dcbqj$${2n=0m zAhJ0{0UVB8=6}QrdD@sujKNnd!kPW-L#w1{{`ysVl>0;|H2nx%)VUY0gCV`Jx+Pr> zehzJpJ!&J$?bl9*UW}%1_kah6jxlf9U8G!|Ep2BOw0#f1p5o^rcQnY(SWd)q<`G()_zRw#$u zD{Lx%i3R8ZGTcjVvxI^di7xxxRqHD|<(*^439ioFi+S(e%B)5@2DC0vOZZO!GCC0( z%+de#3%+xZ4skNCp5#8!lnHiTySKAo_dd~e;+_H2;Feb6IHP_H!j6n1T-9!xIrR{{MwFO5OYK@4Ft*%AX+ zAn<9`x{mTSTrz_^pzFi%*R=qCVf#>ixqn@qT;N#@i_U)cI}0==+y#lNh%2GeZAk+o z*Q#oqhB3wggZ#-CQ^*aUSiXje>^Z2ry2FtCWvJy38r1V(%PaQi+hCL?5YIq&r`?Ca zTJ{QnF2ubQdKy-Q0Ya3gdigq8870?)fIW#cKT|MD&<;flf5*i*$$@{}1)zAl{8XQ9 z%4PDPhyKvpw2+Kb%6R=qGga(~50}Kgl{(A;_*Q~Qge4}c(3Mr%W-5$P~cgQ{fHI3 z^8REK`}I&g#Vt4}4*Iy3FdWvsQIJR7Tr<8SO+B{r_k!URYYB+^5ozwUF9@s8UDzjV z|D$RL3M+E>8F1f8zeB9hzw(jQ?<+>FWXD>87mC7ObF645--Z%y!1r5P{(?S93Rg7k 
zNw!u^5U6LWVp=E$cdxcfEvq}0_}m8C8;EJONOzcLse(Aibv@JoVnw<=0!x&#X?xz@ z<=XqvrPUE>bh+$2h;e_B3xCXbs3X|*3`SWOLit!^w`NO!@oWE)!2_+^sO6l$K>lu> zknqp81^(3cd|!|Ba^++9_-MjoXua_ZE27G5MsEWosfG$M|1v1ADNH5O_YEop)8chd zYqBs!8px7u0P;neFBPy_jRM}#OznpxuVj68Y8wMvmYgGDY-1RWa#wayfG#FJ6vv9_d0){MP@C|RR&da{&nfTjP22ZHKF z3f2wv(OLDQg*!x*M~s}FD6^SXPd`jBO|E+H4KWNU9diWJVFow)W&WB7`C}pemfHbKVg6+laIK8O#bAKba_Lb9xPGA_ zpc4QDWnk`iyb~*#>!pG_q3Nt$&pI|D9{5(dsuQ&gUEIBIfN+3IUH<+(%ES2yLwLA| zloXwxGr@q7d3^}p>-5VWV?e*Gfec!yhbR;?T0+?N@CA!80Sq4R$dcuNpKV!0@;}J1rG`K36jhI>-wkBb@1c_zvmq%RZ`e6Wakz z^9|xqs_k>WimZj-h;L+%zd|&`U>Z>>^*SpcfahZV1bf(^E}{e-%oiYKkb$xtnHvae zhDPZCWB}0AA%R9CW2Ac2uVL(a3AHS74Ri>htDb&Hs_+YOFLTD}n+d^Q&u@XWrva`7JP6|;uP#T)Cp@6Z)`k0j$5q5t${Aiumk5Xc_~p`8k^H4Q zO7+vLB}F?R4fHmA$rNuKuBCbycY|*Opu}mEKKeeriiTJ$G7uv`@o8wt z69}+99dyXbo5J($LA&n$D&I>`k_BPAh^+C0yOby@`HUF-P$-KGBF=_`Z;;qUK-p-) z1DyO&E)YwCn`p(T+`b~zfvOe^TpW3IE?67amkgn+rQBdILC0}a3EC;%qoFi7vFjZr zxv$1&WG$4pV7iCS`RulRHKUtC{ksJgn6ZI5y{7SV;>K>O%eWbGLLhgJ9U&8Wh5N3x zqK_@N_Sqhw%zEV*Izi5KEpih*?n2X-64v@5?Eu55cTpiRV)en4USA?Ac&Zt7a6#fr z?y?NoApFNIQoM!FB(oL~llJC>;&`LWh%WI`?>?lbERp8z7MGFC7#Yk5h3CiiXm9p>yauf0Up2l z34e6=<_qY?nM(~7jFB{%UuNHHM1yNr@@k7GaM^*##_22OsHgpHIiDB(AjhNs4RK_{ ze*)lEx45;v#XP`RrlDriRiWpIPQAj}B%3if_Tm*#P$CUZhIc1bUq->eB}~YzG%>oX zH%q8WU+ivvYV)psY_d>))HE|Px+Q+e78kOPB= zMMMVGQ?s$2tawFJ-LW|6!bD_WAmR#p7L(_yN{nZu^PAogXq4DR2wEouKDi}z3_w(j zMY=Z_=RmI(jx|jJ}!OU?~u8mwBJ%x6j&R~;iN2zfYQ zCc)-6Vl=Q!=!hc{sE!|FkJ_>__uGV&=QEpY)u=o%<^WY_OQGRR!-nUw+#k{@QSNh- ze+Xod$|=_s3SqSGYg_$5$(Nll2IuE<<^A|(sU~7yfl@jJ{wupZN^)P@=N;y)Frq^s z%#MDjmO${_`Wis6+E3CLocu9mJ6%`z8z%qJQ5v?){Xvgf4#8YObp7gn<@&vo!Mt1n>im@ z(fsWaPn);RL;}iJMf`2hXEZW|jDChh3Dq%-u!{ev1TqT1n*HXF zDeM|{nZn9i=+ZV!E-{P5DUxty_`6X8rO?!%c09|GJEWpI+&l60QDoS-`l7kGKZmgG z`$SBV*TLA%Hj7QNyC+vXR~XtHJG0*c|4i{z8#q7~topAQ7U#X2_3$ozBp9$CL-gqO zh^qhf5x%=_&#`NF=3I~f)ZNj5xwcrdhbU78=3(GL7wW~5UirS*Q3(4=>@c?&0RaH&P-4n0~pBdo+B-Bh)-B)kD6W#dHY06>wJHuEQIU*KAnL`K8HYFU1xB$B&HFbgafa1qmC 
zXQ~6LX|3D&1Frq+P%t<6agt=#cTGHJCb;er+jQug9qhcj7326A$)70myxmkJX znZ@0#rwJkSNkjLcgufD`bR8WfSpm=qb^+kk=Q{$h8vory1KaxKWSD~CI`hzQ5RB?^ zj)m$q%=C{xYCs0%Kp9mF#RIU$>Z!so=SE}Zf6|&VWRQeByxtkdBIfFJ)VGp4Y5%M@ zqka?)IMAHNOP~{0EnNgoqiKz;5485cG=F~XI$lThodd-6(spgBCFwO#BSsP{rZ+%> zjK>XgoMfHFCd+Wd#aRS;X7n16oil}{Kx$Ig>5BS42j9c=nU>2}v=$;?^}=;HoUe%0 zf2CKUNOfDgPu(2GiNwbujsi=COcp;6Y1E|Gl3Hqnk`+Wewd)sBc*E@khB(>ZhQtsk z5TBWpnw2EW0h~`bj)1tK!Y>m<4j72erTWncOCLJZX1B;AT|ButnIY4helx`^gkb|o*242bbXW@=#4l68hX=XK~R@w zx3RO6gT`CRnm(I3iVc|rG$dg%4P}10d0c{WNRdT^tq-BURX0ORc((+CHq;E=&0vO) zEC`vLO)0^Uo_!Q+Pcg)jk!%}luY0j)GGN&vW(9Z&A3(2R-m6WLBx}-zmxi5R%#FQ@ zWpafp5t@!Nv{-zGq14CcAL&ou=O*_$jN8v^3`!FV_h9bl_encRkiPJu>4uWjr5>28 zK_&rD8n5T@vuie1+))+6?Lo*5K+qQ7Yd2b5^oxw?T#z9?unejxo+#(DlOj&}uJw3a zjd!h!o{vRA{#k>$Y+H`+v*5PP$K zF=2mdTpxxrl}i_rogW)dno)_Q-Z#`wmfV%I{87@SV0g+D^pjz__2q|I8SVoO z5XlUA(G!$VoY$xH;N15>0_norgqmg*!d+;)Rg4zQXThOELGAjEc`^7M- zZO|gCVG$JPINOkxm*UcKhGyK`?dX1&0i5z?E?4;}ToPZ^GiUca`2Ir3K}JEvLV56l z?n`~)dxT_}xCq_X%9;R9Ef{v@H7r3@RWtgMnG3P}wa;cQNP~N+MYoqO{PmcT)4A2H zrm(UpkoBq!WYP;ehV)JY`7KB`YzW<^0*fhsL%cNm6Ru;8Fm=uWd`j?JXx&3?y;7fY zSz_uW<2s6~tz!Dm>5{`}np!DRo(R>k;x- z8v5bKpso{esZEKtn2iB`M~cAZ z6`)5v1842EfY_I}K()rUJ#bLtJwy|bY>o*hRyK=&OT8QT4H|yy(7~ga6R4WZf_ZtM z;0zk{X-5l{Qc4s8X8WE*ntK)Gk3-x$Ui3UG??Hy|*Q7{kp?YGx@uph>Ic@GDAWybpW2W3WRX~V(&DhxkjBG|BG#(1w>XA+-XND}%)`YiWA=TXtYkbT^dk3& z#POCCrTu>*7&RV;nUK}vkmWD&Dk+31t_R1` z+Jne4|E*M{$hj4y>D$79kBr?x80g}99cU)AxjLvLvxhQJG_0#G1!2DVDQoxxR0hh23|HW&7L#c818L#J z301>QDM-+bnczzWaK#K;qNe8!zH(I)bEctpGH`2$hbGBo1rq2XsFyB<*}_oFCC4qy zid(9e^^e_#`&Dv>+W|if17WJPRWc%3_*e_8D?hAM$Bdn2sOk245pP=2*}ktZFKFP% zuQ9E#Dx?BpE32#t(7EVm-#FH(aVGlwcSpxgRUedMj=Y3)F>vMp(jwy@zPR@)tPPo> z0_~ya&Sr+eZg(^RmqvaNvSDy7JGqQvTOPnU3{L%73C&ZmZ(KKZz> zr`!5o3D;?0b8hsGLQTLzu1O9aDJe?ElYKEeMMqWkwIt=xd`8u)*KmhJ=M~M<41CS8C-CI=d+bFWPO~5U7dZB$c>rOZsVJ$$dw4q z70#VBubQTPb19W>S?t&YF4g1q1%YA_Mom_e>m|c@`%oK!Cr66aEA`2j>g)cwGI^57;*gAB!LtP^_}i zH!M%)-^$K)&y4`ehK9^^OdCzfaCiume?TNq+d-S-&f}c6N3iGE&SU3NT@^2CB0eOo 
z8?NdRVbFW&+!$B9p17xCFqAAv12(M}xd4hHpbA*}O}k?qb`1sj{ThXWtq3fRx<%C~ zE;7>Yw2l9FyCTF#c4w)BLM!t%3D5pGpDWfK0n%e_;+p{P%ykLg!PI*)0fj>IHx*jntSO% zN|qE{57TJVaC}2IBT?%IPgZhrmiD%pZ$lU@V0zxMF$Xcq^bw>Q19(UuLsd?M*FL!h z5-Z0ud?B7d;*wCzAq2~_I6Yx@ybu?c?SAt4r(sN*3Qx7siXpRU$fOs^r7PpQEx`?WPJkLGk0+b%ceyPdH$>D*1MaY3eh;~fhSxCG z$4b9ik-lXq<=CZgxHoL*^i75$z#!}3IX8S;A(zmU+O2vR6?1(SCG%lx&0HVhs9%R3 zY<5kqtdq=Q?cP=`hucmWO_FR-;K|~0)8jhtFBQ2|#1?fy)o+*X%NaPfINESSQ&3go zJ+9U^a+%lCjW~1hFK!_yM0l_+tjH#B;edlkP>U5*7w~eM+W#(4{t_?eb6EBHYqGiM zW38x)c&%s96ud9TX@-9>xdt>24qMr~E zUnCKAeFW$``kGyJS-$TA_KEUseLp{^{T?+wSl=EyI=HbEpY<+(B;Q!sjCI9+nktGU z-LH(u{zA2n>kp$@oK(~Ix`EJ?vs`_D$NC;n)|{B>AbnwDWglXtl>sVeDPDWX!-%es ziF>vxme#c~f-EvVkKW5FqUAo)=3G?suij@6{#<3cVY{FS9JOppUutMkGi`a}6KQ%-Pi)&p&* zF^P*IEG+2)ml=c%-q=xcUp|YYpa!li0LfiU!F|$GhCu$8_^3rDG`ztEIaidK7u3H% zBcG|Zxhy_=;Uvrt`do#NkNsWtEXR)z3mVhPoES51MCMSVp>}ItC&z^!F|o$>j%p># zACgP{OA`dGPB07I_F7unvzx!gmjR_5FIeomicLnxX$SnaV z(!l0zXeu3z0yq1ODo!pQ7rB#dbn06p#V9n+`#eTN*D@1e;%_clD@Eio0!JkDr*ABZ z?%rPNu4oJAx}zJ;pT?JhDdqXLlhE|OLTGmn;c&&Bmm0{uYdJUXOz^2Nx+=j8dV>tc zxm3^|tixR>-hf^5Y%a474EaM-pFCzi+&D#n6`33%qNl9=4p)T;KzG$gjT=V$-Yvx= z3Mv`{b{XuyVrsYx2WmK82sh#s4In6{!yWGCNKrzr09l&Rf27a%8=a1IupA7W zl>@GWK`<%i_aLH1ZfJoP*J}o~Y$;HhlnD&`S+sH)m%?ou01Jl^j<7axqUGLrk2d&N z1_j@Yk;cFyyxiQ;nAsTy>zvJR96mrT=|}^}^zJ^V@~}LiAiiYI5nB4deK2Ksu2Uo> z+aj~85Q5&kfMM5Xput&+*U)qR^wehkIQONk)@{1|DF?i+yci4RIiMPLJ-DrJ6Id)Ia+5-%77JwO$)@R7+D5VCiI&X+&i391&m z{%})11>r54ToGq~wXvj;E3csC(VikI;5zZR{PVGnr{=G&E4|y)Z;*@Bg}cX+?0L$1 zs)Tb8nxlCx7H%_uP z`@92zd!_Z;J9!-J@rLanOiYi+pq?Z3U=X!gx@bQ)V%VS%G;H=Pfh}?C+5ilxua@=eL`jq_Ky|!rt zt%;CFwg%6P5dahVqao{R@6x9IowVu z^*i&8tXsV#6KP(r`*k#ONNN1EK0-rBiqG79QqQov+{^4wC#|oLK277*-uD=?sNQ=wH4^+MzG95Rq(t1lL_d% zC`G;#@m9{D+~5%7BEVV6Ej$o$h>kg+_N8~B(l359i6Xa(%jiAen*VBhIb+Yb#^2X* zE37Q!B^eGOZLHrsRAvc2kNhS}p&OgCid;6N%W&hL8Qj{er^341NFd6-) zDn*J4cR^Sy;EZqIq0G9*Yea&yFeQShKKa<3Ix@<{2(ur$mJu2zgV5;F4fmOPn4rUOqwA^9(pbsz z8xWOtKxSu__CuN!An!Kmd+9;&0HFazJv*=1{P!hJ(T0%BU1&4%NV(KnE& 
zsYzc#u8jg_P`6)E4WcN;90>ecfmkMwd7U42BnMIRDYb*O8GxRuBXP-bZ0qyP`jDmu0dZU08 zRHr5FP|U=eFsm$i`exnSTlUpa?B%8J)+K!n5Qn!JW>45Bvxs+CBjH=I zrZrz&Q2MKi;meA;#o25VlfScV|I=Yiz@W5FZiWS;C@mF$w_L6|)5}W|m;82s5CbAp zpcV9WgbC#+;YFGdKEf*~h=MkA9IqzHwG5o0tc(;TZF_e%L*W*F8;~B-hT0uTK+Z7< zaltdq)P)cHe^Hp6v7U zP?Q8@2q*l6B{ZM`RwKRG2B5n_(kai9#cN{Qvf~1)*oTf1=gCZv7l0egfa6!pLRU)b z5J2L^ca5Oq4gEGPu(Wb-+6JZ#lZHw`gBP&B6vkIHG6A*enI9mLuvUCvv@k+3*AQzYZ31!hGdW5Q=MnDCFhw zx~Jr+{^c(rdsX2dq&KxeSn;O*6 zSO!3=atFSFEg<6&FjX`)Kh#u<4m5w9I%fP4;8rKyA*%A%C5^>mJ?epq6w(AegY7t9 z&V*d38mkeSzvHj7ZPqBmFlZI}85;xw5B!%Jrfia-{a;kO670JM92Fg!R3uIGG&2HdBS`(-+=O zG*AEs5?UIGspWQAKqJUNi`+q1ual@@{~TQWw>!eH9nJvGDTMe8BhQAfz-VT2!Bh?cI8J1_2#vvK8B!M-;pPrC(6e<+o8buC)YSS! z0+GT~Cc=P!UISf@pP+ww{ zc)l}7WPcAC-tr{e<5F=1VXEt&V~YK`a*6U7vx&*~2x)3Pn<69-t4{*$ZfOV}{L^w> zMC(v6s6YcOZ!ieWk3dL5`d(}u))(%3KLt1geRz0-*2F%44Wnbz_K%~T;Py|PCV;R3 zL8@}C_|zEr>1#OICmx$$O$coV5Sgkwe$;e0asAQ4iAsb?77leBnBp8Mlrz@9vnD{% zUkGdZ0P#skpgJo2%IkSs3=lX%=3_ILDJc5)K7<2o)4u~gQ4el~Wl2&NtRKgrK-q#o z4IwF*!*{R5(22P}hnAL$2X-E*2FE6BWuMGpepyH%oSNIsGQ^jCoHtL+EvGgb1*I?u zGSJMUtD&HN*-m!Vl7~KA5J+Qgk^1=FKAaag$5lu%2F6YWkO7SL2QU(M^MNh%^b=72Q84MHosJ0?3b5k~TNZ@(t-l+6ZLPExQLMgB$K!+aQ1%${-jdaPgB zy_*O_WaL$_@-a>8Ac=2 z5|Ap_;B6I|13F=aknfOZ1kCiPWyfEH-ilb7*lMI9+)^_<3(4vf5f6sbtcNV1}5!1(|)&VyDmm*Wv_xCdDh*Pijs&c!^P8%3;_Vf*FN0;0R&6 zh7t3)se)(FBK2GT@xMd_4UiEn<3(Kp#h-UEo0&ymlv0W zMUhKw#cIn$Cqfp+1w@)Dvv2IjJ#V?X)igUO6EUc#K^%tboPc0ys9ujn6G7Mn@BjfS zZUw&Yuo9NX><#Bu>+mPSl;$z89SE{@EIL#eMvfXMpo2_vE8BJUijbckuE4p+x^qQM zBvXO0%TT8ab8A`ln2~*W=<8z1Jq-}+A`B{+t9l4} zu*3^r(671#ZYX4L?$<)eWf`Q5`xHJSq_svJ7ZULsV?4X|65O~KI zC+tlVG@G=|${ED135nKgU}}3*n9?={U@VwVcS^k}9fD^SZEhq@mY=yH@B-ZAHt@p7 zLiz7t4KzcR@jB%0cI5Eb@Yi{YPpT>wzaL%XCgZoBhT`b{d^F$)Zap~K1;gz>)63uf z!8aR))u~zCRsQ79R_~AGqK!5YHaQ<4-9YT_Xfy3l)Q^jM5GoUR6q%#p&mh6gxJ7}h z3$b7d@`V$EHnT_vMvI~iM&Jqsmaqv=+<^3aiT}*iUUa9cq$MIYU)iuIvIAM9)j^D$20V#GsU&N|4=}xW$l!T`Vcsq%c z$egN^35b=lqW2?pY{^EkGszH>nH0PX917+r=7fw;|kr#cGUV+iRvp*_xHYp;Ob 
zf#JTbb%;~Qa1+Fw(&xvL_3FY43vsSsy*Kxv7Al+=!JoK#V*^lxZ*9Kk!_b?XB}aHJ z@NDd+d18Pl8fYr~4*yPoOAP=6M1WzWGLUrU2nv5g#vrZj{#*%HTdhd9GH~qMSh`j~ zy2(H2D|YGVe!T*dxnN}VS)Nq8 zx4Us8Lfpqqa>_mOBQk+u>iUUpgy~D|#8QA5wAaPjl^qsmg;=&jt3Y~!@yp3RWrhpc zHadSb4t{KguMjFI9?+ro;fIkT)uRF+%dHk%sC;r8@Y{V;+Iv+)B2eIWHz4Z3;L90E zT4uv`AR9VP9mNT{0X90kf)X@Z>)p4jnC3)HJQa94NudF-j?{{PTz+=a%tl#I&;@Cn zLt5uSMTC^?Y>wm?ACRiSDr!6e0k%?XSTrM(EaVf%N3ustULr0~-D78S+_wW!{*qMR zmyOvsHv5dh*tSwOr!4{1M!}Y^6zu>@4qQKf;2ypNc#V}y!^i( z7Vy7C-aCf>y?p1k;h=xm3lPQv zNCbxM5Y{SaU*58sHfM*5sER};Xgoo7}x0{l-2z>jdym3vpM?4Pq%<|higFUS-Q zz<2CK@abKzNn1T&GP+oj2YBC&=oZmva}e{xIfxHiRbtn8Pfmf+GQUV3fBu3|*6W5KV?2ZzlYf z)fF=+O@H@i4;n5w^b<2ohETrG#V;asW|(&rFoq73OaTbq2ih@0(2GQf!+Knvo%;lh zt?&z7i+@bjN6OJ|a&-c|U5OVZ&}4jojuMQlKx6sV12L{1<&#{NRZN{~Z+Q&W;T?=P zwxF~=@y`CjgJ4ICKkjsEOk7rzt(Ni+L09?#bPY0+8vz#u3xG)cLL2ACh#LRTRUHGt zT3>42T@4q}`{=?vG{D8*3EIvhth3O|bKk-{OX{KYdyQK(1aC8Am~^{~s*2OVIuV8+ z+*X=k{_P=5U*j|Z=_;m^1jjh;4NT>&+Ia-YAOODBFxvEf3Sh1Mx6dv!!^|qo%pL>! zOtKNcOHodE-`2FJ!60`gG?x&189>>oM#%==kQ7I?F(*P>cXo@w+8<$W0U(m@ohP?{ z=$iiT`PNuxyWZBsC`A;K0oP3{biJW@x7{nzM z0OnyHU>`uLF$_KMLqkhKj)pr=q5q$tlKb#=Y>?`@b7)fE)u_o8{Na}PW)R^7{B-X9#qzf`z@MuP2LS#r9~zFx;1yVZ z%fv~3($+>4z<~{6?TlDq0>GY9Q04Xa>#?ezR3goWq+jz8SqByxM%uzhCukVEa*qBX z-0d3y?7z+_8ywm2n)y3(8%aYIMFf!y`D|-rBcWoctLN179M(Dj_uvPNk_h>Fry4&1 zM4vii3H`7Tj2GBb;u*VY1SW^6#NULb^C^kvIG&S0nZn`3-O(7|5@*^_LU8#?_T=$x$`$&KRzz zzgJ8tf%91?(p@wX`|0z;o;*hwNXoVtgSqk7;WcJ{1&rhf^G`-HA_4_EKxEEgN92Nd z4I49*TqOv`jVY(kHg#kfO}&LDxSp-am!C4GO3g_#yu{BB*PMHWQF0*MHeEQxK<(B_>d*;T17 z#{;dzL4>mVGvBSk$3I!6UWm44Zo@e9QDKKw7!3+9Da7k*=~2m^OsiNHn9A59>`2#B zbHvw%j=I!qzp$?1XgEi4kU#dU(!5hPf0gNZ&s9{e` z#vQ=CB)?&=!D67DSY6@Is=?E#@-TJ5NFcTIR_TyBBl?${sh5Qj{gmY(`? 
z_LWe_wYAbvxi!7-gRl~D3!MN?fbmSBPPK=1;otZLCKT<^0Pc4N0;$Uj!NB4RO^f1?+q&)%4UjYIf38(A+0YD1`FT$sK9hQZ#T7o z-Q+|*`&N9SX%S^VxYkxu`3qlJNAXV8=bb_LFS_%ARuIUr$xL4ujyDU$Z0&~^-QJi~ z{c`EsNjv__f8-+{BzvDFPos|X-JA`4S1K&gSC$O`%Om=d*6fQx9aeJ@sITa}9 zH{h|UIX`~{kqjp=s4-sNllxHAt>;Tq; zKUte|CHaoeXxpdKy3~h_Y>BQCf1D874Xo}HJIez#LgZjdIx&!q@xwtb3XViZ8(S~l z(Ly3)#l7<^K+F!1SlX-eB@CiritsLO^?p$Y5VnK@%mHM4h<{9i;P^6M6hJFWuYc5J zSc*CF=Z68pI6-S`^y&`vdU0Y+l#`OqEWj^>U~oM4nkj3TD7DuP10rApcu;M?IBq(p zZtqBMSZ#!~PON^$w%*<$*yj9mO*&9O1R>ensir<(@v|_PLfzfHnDC+t@rLG493u0QdsWxkKNrg|-wuJ$c zK7dc=5HO7HRi0okilYE5g@6k4MU66UXjdnu&@|mo%^y8d{!GKx=!usrb4L0$4hmfu z^q`@DNmuKcZqQc00wYLQVT~z~HBR9qtH?CtR~E$GdHY zQX@j0)VwjAy8}}Hd>QDhaBoyckv%raRVPRd?IAS*L?(Uvb(!P>L~Fh3 ze9PiQ3+MS#ozS!?cLFCpsU3{jA?XF);?CckOV}a~0^Zq@7v9QM?amsVT&M_O4Sll; zOxx{nISJ6RW2yTkjQmp)-V1SfzQPzt?cbS}9D_uE3-aa{N0$nnM!aaw41c&IFO{+@ zPW*59g)OJa<(49N;Ln+ZPuXKt-$BM4elwK}F9$p=EIcCUN@{LgpEwuH z+r7N%E7_jj!D}k@Ck(yJ7=t&&x;YlNQ0jHGA$w%bGq5^{cP1PNd;pFWz^3-{!iS$^ z#q?my^rF|0_oj_U0prYNpf5x2D8(^dL{mJ&YpR^WBZZKPF%Tl7=f8!JNXg#kuh#RH z`X`u{rMKw?;f@6OHYdQMMtT&=Ud1Z3X9ZkY4(uEw;Z2+7og6-g#TQ+6uo1s4GbD!f zrxd}>^J+VGH4q6lf41x>jzd@(VB&8Rfam|Lm?;>@PZkQP&;p%SrmDug8&5yg8>^ z4Hzp+aF?Z&Kgl038Ia{}Q5qo?3QX`!d2YX{}Xw3lLMFBGU*VB}7lP>JBDR=su*CRw++vVBqv=i}qZL$8?sp z#}W2{slO0gn5zn-3nk_eS=0`Ck8T5lAR_23#rMymUKD+cN!X8g!w(w0aK)Pj{%%n9 zI!2Sx-Mr+!NMlp<+F9mAbhVM1z5V7K4#Qtdt=euyXB2@z6u{*tL0kJ9>=gS7Wt~_azshMPoOm_-=AY*P=7H$pf?mNy^d zDqKkcUeleV3Jw|W*x^?NARw>~qJlsqNzZ86wiY0`nXvJe+rUq6SNZz4;6RU-AGa)w zVRD~oNV}OMlv5NA=ul9U*2z)&DYF94Ch+6gNNxZvXz2E0G{CLtOEx+}6Y?g8omCQ4 zg!XuIX4&2Z=e+UHK^v;^%ICZLV3I?i?=h)?ISp@l5LnAA;APrEh|hEzKinE9opBt@ zX{dh6`?p~9Q)0ckl(hjdYQA*i{L7)hwKxWRaZ=o|yJr?t&jxXdJ0ULGH$$%f(lZsDUdVSfX>WqX}DGc0=9t*>?GnB^3~;YI{_JzDfi zejXRmkK`ap@i3sf82ke`Q3%{Qp!bImz7pBiZet7bE|y!_r+Mx5{+zXNjnt)f*j$b9 z_6j`ZF(6 z=Odgu;Vn?Aa17+8Gm=wG`U-6#c8@VJb1=|NNRMO%od+AKXF|Ar;6R_BOUuk-dbio@ zLU%qr?nT~(=YOW#gC7gnoipijoH>Lufno{Rl4ib9Xg}vYJrZ7UfJo2;#vzza&Nz3m z2fW*ez8vwV#}U?yG$VZgI;D79UtL^QqM9x0k)rtJ=_xgnu z0&gS~3Yg;S5 
z9)!~-I*mURYyW-CVm8GDr}fO$eI+t#i4))ajtC*b{1dUUQuTdI;{I=76~}xM9kU%h z2%Ivk;@f{&&%a{3XgS_Hlf^FwjIxBBs(c@q0M@+**g(2tejMt5CLtkoFXB4Dmw_s* z8wxMsbm?(L2 z-a7Eq#sGV=h0gNg0M)Ma_iqUY0|SyLVR!MtBx`E3`GEsUA zJ^`aijw5hX6Eq>{yae8gy3LbM?>O@vw7<-b_wQXqP8Wt7=drBo#>~;qj9U?UaZu*o z9#e3Bi-;D7`(8%@TU9>5`1|Rma!0iKrpWK{EQm>H{zZ1tI*6ic&?%5hFZBRs$V)c# zjNLy~I_I?ty4G^EZ<9iWT67)FkvaGh#uF|Gcl&N1=two9H2zzq@`GepUmC&ml(RJy z=_2}uYeJo2kPgc9f)AC-NRYJmv4G5+j#ZoKh>7k;2<3ZOmmS)B)NZvWA(KwH8E&^m zIddg8&we=9GGo}nwcNb>P(ugP>ICQ0M{{j%ZaQZ!4HB~F>3`)Q=z5mgq%|v7V=6^J z*Xz|HLmM!v4hons{s7f9-#bA6B69hwLAKo1*Yxh|x#~-M7^d%i45_#9@;v_bgmAr> zzVHLWB1H5Oq9r1ne3rpj6Heja>jGqc1H5%{v_%zz>WQ{^$-T%$oG@wjDaULosOC(I zo{<>kZ{=I%`p{gL&o2PlULF`n(CdT>aQBvv0A*tj&mQJ@_j2#MyshdBAB`PG6W@OY zs)1v__F@fgoV41i%{f2r8IXkZ0&l6Ok9EnFXvXc=(YaI=xu=G!#*Asfih=h85^*J)nIvzgi`R@~gJX0bErok?hWSexx5;Auu zP+XWhGPH>QoJt_gd1rghG$C(wo&%fa_^;Rf*L&Pm>Gh^G?zra>9DWq={=PQX7=KC; zqvx-H6Bmw;%0xEo0Lpo-LK~{{M+iEhXwi|7&G19F=eK7`x=d@JS8E!&6`=!V^^n$( zGyETXsWAvkpMTg&U1@L|grw(2e~Sa0!8)xHW3sWd)0$I1th58v3j%ZFkADfvmHl&t z-G@1ZBM5@3o^n$=bLl~`hF`=FUOGenTM&Wg)wzjE*RnV7;@6POh0S!;Fq+GHTz0C< zEq{9{0TQ)m5{9`Q)4bZ|j(e!U-SsJ0u1d%EDJ~v32KsDo&fMu|{}JA4Yao0KwmsTa z<$^2)Zs!vTI|-pq#yRutLe?Fp_S_v|l!o#3kzoi(p{7ue$puCwWPflTr>j0c!FTrf zr`v8%(>I4o_K;023J&i$A!P_ls0k!R`5#t$(F)!)z*xpT*7wP$idpC|q- zrJ;EwdY|I&t0eQO_u+&Ytun5f+kLl<^|oB`tgyA&yzZK??6sHGNj{ursla zKvj!!4~lR#R+G9miWK8?W3p}_`bRo{F|ob}F-WO(E?xD6b0g6A)}135IR5B2`$BB} z1q!*DLA;Qlu!R{ypu#--^AkjcaiJI%-Na3?8{BDgOr~Kn){AI4z2b9Qj&kH)Cs7}3 z+p4W;uE!NV+1Uq3+Vvx<2v>Z1_&|rtiEMuEBwt(d-TAg#PR3L29IrVt>oCQxU0J5+UC? 
z{e(uD<01yR*vIUcja`48lN*lqy!rHx6=*F+IeJfZtrducUxYN(9EQt&DQ5Wcf?J)x z&mxCRdOjer0iOK+gcbenl(r@c?U1N11b@l{5i`?auKLV0nD%T%Jaem}oPoP#Of2LR z;9i$kC${Tup~%EICSE z&q62207|_mDXx=CE9yXZ8X={!_zKZla6R$VUiLVG<`9}un0o^QhDU!=OJFpSM#BUv zvlDV46nf#{dJ|cd^HDT+WNGd7Hs+Aj&{8izcf9}l5XoV48C6&@x%5wJuF;bV>Q*N@|_)dfvz}J&+kr}RfIz$s1#ua&ve^eWY(harS zOP_Txrl2pFXd+QzgLyUa>lP3@MZ{k6#Cc40cBCeh=H~d|fP6a=rT$iprIp0um|PLL z_kDAOuF`~P5C)H{>oReIcFzqxdV67~j|6+S7Uv?W*$X3pUaB=B3!Qrqp2O(pj(G{+ zTrX+*MX)1Y(O#rtbM~&3#cjwaIZSj}`W62b#&@%5Gc}!{tflRd#hp&IbzsbzfsTh} zh;Qrec`0y)b=WMuv>UPUNRe4}m;olI>bld?3mS2-8_1NX7b(Dn&*3PMP+iBEVJkqNAJ>ffBH_zhG0o%2kb2DmL4{Vy}h_cz#Pmf z7=FzoGBMZ?p$0NGoxzb(V!O8#Glc$Eo-?!;R>PMNgNzzONxx3Y(7I=#_5hLR--7B; z$Zxgv2<{K!b%Lraw5-&rD_IAw+CU2i!Fe!0KPS&`xd3hFJ?wx8T|H^89{)Bq+!Itt zd<}kuz~5X!J<}}gLJ79O)`zQmQ3yHVyn`mg>SZ9-0frXiH=0qk%ZGW6EmzRdTAxb`r{ns}GC)Mfe@=v?e*5N_5ZHHd_X&vAFeWDyl2##zbVMPu; z=PY#KFS|B>%x#*Jq$1rT5wK;f&FjjupKL$spx8T3x*WH##R^fWV1O!yU_0Sqy zO8dN55&=J(Jz_d{ObC&m{Gkd>T*imIaQa;o^1|zAq=*eyTJZrY=f*ETd6fVC-o62- zQmVDk05%|o7>6=pMa6@tTgs4uJnoZ+(@rqz{^xN&%TxzIuPc>*-fNo!P|&n>(3)oZc78X?0Vfdn+0K`^Z*~LzjYOTVaPotQe;TEmllZ@ z9^e&*M~l3lL@h&E8aZW3*WbTE!BrPY2l-bfov!`RvV|S|XRQW56=0?Wcnllq-`?!s z3Q@}bX<&$;JKq=CN62OOAX>uv#4(zsvd=<$@g?TrooJiNH*(E^Zpj*dgB))HA~8+D z2~AmO4EyZm3ywVLIk*N<3$R)NrLWGPN1o`n%PaxKlbxfQcPfs1SdJ(4wU{yf_RvV+ zwMMH?Qa;tzqLVi?2!!rG{OjP+J!D265MO|DqTXA$YS=@$FCw;(9}6)?n^2=}t-~XK z(Y(#8hh2#%nNm;pidqF~l3&!Pz8a9wndt)q9Ryo$++F^$7g^Mue0wmx4W`3|l*miL z1jLKxV$JGMa{@BAUR8?mpSWh>{I_1=8htH;mF|POaPhgDGz$;j4)1m-Z8_m0JQL80 z28W$Uhe8-&EZS94)7y;`9QSg$Ggwf0YP5sq{sV7Fo=>xqu_Kr-Xf%jB5(@S9#>D-r z0QMu1WhOuc6HStZMhVFLY&w-8V(WYa?4s3MOQ3;kK(!faBsMj7SuWT$%%2WDuH{KUIq*pS?4 z2%laMjOwmvFVG!%``AHCWiQi(?KH{vo_oM(^ro1Ir4$Qc2#-i;K@?kfi_rs_;u_6P? 
zLaHQ$Yti5223}i|#}J{io4G^w_8R;VALPYe|%hk4PT9 z*%w6L@b$Bm_atAjW!V1RUktzd_diXHilrP^2C*jDJ}ys`Od`Kl^g8Zc)h5qdTD1F6 z@Mny%z$yL9ClxUb8P=|w@86Sq|3(lbYW61Ygl}B&vFoe?iJE-mO7UX4h;fDs`E`)U~ zn3|fZw=!LsOLcd5&vV|0e$PHga@fF$-D_%ks zzEsfA5QzS)1(j;AMFKCcFN0SLtr_C4@I=Pr4|A?AZ*@FAn34>z3El^k5!65~i~M26 z-apC9Qflu)j(kcOH@Xo1Q&fayZgP|@tSqarV`b)%m{CtAh1rVJXc3c--T-G)X6+G) zSWCtIXlO~JM>%!A%i#@H7+8;-C!q;nOan{IKW&gnH85#now9(LSn=UI70)=wHVM2M53Gc$Ya0U3IVX z1%yTi*}UUe8kjU@gEi=gE{V}}MlXpCMweY1!Q7LGeF6_tf3f%Lz%?s>X)< z1-{1^MVaz+IfF&4*9u<;wS)*e`}b|v?uWy45(nNt#Hm_SFo5gmxA0FYZE5$L9+HVP zr4SdQSo+Cz4k1UROdY+QKbS2lllO@?LCAKSTmKl5{zXwkT&WxL*JITi(y|=S3tWD5 zt-1$K{Z+F2&3q>+SQO$fvQN+q_l#Mu*DEt$D!YU(Q|RrC#HaYI^&vR2Od&MMcZiD| z-59vfktHlttdx*RG%@Y;&G@JB!G>PyO;Z+@7EOVNh6#6VjP^xA4@1uo{IKTw&Pz^9 zRb|g$QCzSJ9n=o{nm7G(8IGtX)*a_^)dMoZSowGlJR*V_zE<@D!xdzeU%q^Kk#=Sr z^K+$0h+(Bja?o#fkM#G;v==Vi;F6S-jFxu$e(CU|dD1X+l%zbhfYtbFUIkj z{zY1HR6j-1>clf?IJ{F#H4;P0S%^xxE6Fh0UX z#rVSv%3ker`xI7GJfJIoyPQd(a+DHHcA0kZtI&E|&O#4^_7?xryK$bf0N&6??xkGH z#J$TBdL3w^gUUK$**abY3vgw`5HVLT- zr9N3-{QS;tZGrL_<*}p32b&ujnwpxL8yj=1go@KnpBR0;=2n!(cBQv3g2JDn%K2Mq zX-Ns-tOhDtM$5`LXM`_1Y@cALa+rSXUR(1rwW7Q#%r+v5{WveDX4&Pp?lA{;w>|7E zdK(rG7d{OWq9uQ4Am`w^#JcUFZI0CxOTBVyCi)DAOzG1Huu1ptXP}$i{6@NPlEG;w zia~qQho&bYJl|=1ljY=m5$GU&_%fT%(1Z6>Xw$m9rTCbP@PiDNF!`Z*axC5hRvnl3 zi|J<4AsjKwQyO0lCfRVA{WQaROy;fUMeIh`DkvVDZEBSQ_Vhbqu7d09nTr;dDXFR8 z!vSP_xkK*3IQHv{`_Re^CTrtV9jAGKvvhs#6eIt7=?JG1%_-r0`;|FKS%X-^kA?9{}N;pw`UoQyof8!N_2O4`DXPd8WR zgvcZHwq{-$KS~_b?~i-`o=#4z@dYb}MEuz+&jo%RrbTk8=0}I)H&`~k^6W+YoM0@W zwe^cqhkrbke$(quwxcJ0oTOton)rFsx!Uavq2lGrk#TV{eu-=eodRq$kp>c z{a3Fs>#8-khs%EKXOJOdQL1cAG1*T=HFb?L90mSU6Bl3uV`~*H3L)+y4vU-9xlHNP zf)1O5_wY_Ju2RYd@)lLKrQVcDS3aV`a;@alk2BUp+bby}oS&=95h7#BpOs~o5`XuB zQr2BuR`Z2^h_E0|7#HuwpdC91!DCRSw;}q8YC@5gyO8tpd$M*mO)nfs=3 zId4~0Cb_=@i;Hq=1YYwHvw&%#NoTcZp+SM@50j$xHG=#}ikF{zH=QD<5@U~mQfaGX&%Ds;qhWu7$`ZdCj4*fd9s7Hjx z3*Dx4-xz4zKEx`;s|-muomNR<*S&g0K~>>il#i#2%Xi|i{5KZju1=4SDV4Z(SQ~Zc 
zSWdMs$pj?4+gN5-w<*S}e3>A|lukH#s_ZGfBFES=j$uea6)oR(XElijN2W?GYe<35 zn>X=V9ceRd$CghEy4B$i@><2*{FW#2Z9HuVPzJ9r$qAw$nz5m57mn8}yq0c$wEyr6 z?a!LI=6%YufeXB&`Hw=#^CTR*1~Xqf9p`VHsZYBT{5G{U+7}KtnDaH)eQqK>jE;s>E!on?nup9W6jx z1q4*zD0vfb>{M2dle;m{;M0>1nKkn~3i=9qJIwClkZPZ;qdr8iyKehX4^H|{)r5*X zcU) z@>R&;yE3$q8e=r#w{5 zn!>3l zP_yUvC0W#LtuG1K&fhI0i<4HT?yzT;QIKTt^4UI;>$EZK)xj$%E@Nd9uBIA$BP;N# zVY^za`Y}4GYxC**UMZhKg$g?p@H?^0j<&gaR6DGZyzZ>jzxJ{Dlex7SpVVj6;t%sz zdnUS=Zfx;VN<^uy+-X%KfJHT3zDRx9d&uDQ{rI-8>To2z3m`yLYYSSgSAHRl_P#9c zI23~SZTz}{z4drqB8R^NisTz^seI_DF#Q}!BgoVM#yT#eugUBK2rpSpG|RnN2ZHPZ zO}JMeW>PP`&*g57;kG zbRN8A#L$l+K|V0)5Js%jd(*kxS|j|MogH7$`3G$=@6D^mQyElT;5%q#NW;jmB_9z| z<1K>idKG)tzBCr~$@8m`?!U@BBw6q=exYk8XN&5@3E+=r%Krj819*O$wfTF6WSYl$ zwfT&%_=k9pAIpEZ&<>A#mCW4pYH|`=dz$5`cD2)0^qsG6kV+Hp!^BAy)a-ny$qTzF z?D6Vccb<9`MrfnX`PHX=Lc8gt^d)p6;~1srD1{TavZ_G>Q`rT*dq$R4YUCUG zbZ&&c@W;ygy%RMln_|!p2)$r09UlDpbbD5s-bj58fxiRE8f@Tc3EEMcg1oKnXnC5~ z7dR)B?@26DK3X*==_|X!cehZgXJ7u~>Z;00kmF>G#w1y!i#TlpgXAMFw>npMqA?*w zNSrlM_C)I(YU%Uht@xhzQ_S@{Q?D+pTYi#pz+-?Aboy}E=D`l@b>pr+Yo@^$#kPC` z=PB!)r98vGF$lZu+J~rUAnxQ+Prk*}(~)wvwYNM*R|I6v>(h6eUmwma;DL>MP7J?f zgJ`lly=UTUk)r3;1%dNabZ4e#h!R+fiG#j*=vYe%N zsd#esvFB%4E>YSFQesqfoviecqKl7|b81^NGHff37+?(wGafVBF{*jLrRQu?+4}-+?-|j|Kk{10J1(ftvukJE z${u9UzQYu-Y?yUDURjLK%i}tnvU%Ftf-fvvf_4thEk-`U*{ej@c%ORAZ{!bqz4eY{ z2tDTSFmH3fA#}3N{>F_~aH{5mWnWd5H_m3qU*?iN!uoz$$`7^A@ph#~-e}|6x!d~8 zXNFyb4L26`47EP^uCj2$XPmxLP)l3myPs;5iFUR!OHLRgS!3EDXwn+$RMwzllTX)q zitI<2@blkI)VY-UU;2>Tu`=k1h}Q@xWV|n}FLLO>eqvvCjG*mew#B%dNOU!NYQEcU zebV&v&H;}DqaMxnDdPjw@0poU3{U;T%3k6Ii`XyA%14B0{Ky=LeX;iw*jhXl{_#mk zCoupgt-F$`V_SD}_|YM{WE?aai$82#Em8V7OB6=;a2+YDXY59YtHpOq4&3OEQcr<$yz(Xh6ESiFtJd}XXn za-1#S<^{cSUgj|?|Fb(S7!n4)RLdjU7tV2&2|3og%t~HNjwW#Txj3B9bOWxf7sDP5 zv|~H?hw>+Vj5HUTTlG^*f3J~zj>47+Z7QdS*w{Yw+2DegsjTnXxhXi;<_DW;&YD!V zk)}%?3dW&vm35Zmg70v15t}-wuWCCzJFBml|FL!7dZCm|UIPVvv;swo2jeMp>jZDB z%6FVADOf*>OoIEjEu`m;=v@A>&~Td=Z(?mGIdL$Vn~bxxxcGUX@Ry#LuYC!xaww9D zt5UxRYX>FX=bWhD)V> 
z?N`h=al))4(p*1_nT^_Zb2LOj>;TjAvDfFKKmRc0hvEgFCBAhD9HGL$(Npk3A;fB? zZ_9fttI8}iT1Avg4Os%X^BZ;Lc7;C++IV~KcJjtDwM5U!jTNekZHI~#-a`RRkrnlt zRYhYuul&w}@Wxxh&f*8S1uTD_RBU`}{ho`Y*RBWZ+ zNfN)45WHY!G+tmDh9j|Pn^a-V8!h=`S!o+^-(@?zf3MLHY`?~!%@_2Pve|}REhSI( zL69sj$w8!^M1(=D;maKtDYdRKEwz(>cb&{?dPke?%iuskS(2W53|9x0b!dp=hw8K0 z$#-LneMwgW&(5S{=HGG8jM@!Yap_(fS+1h)=DTobzMwO6F+&nLFEQsZ{Hd`g=5G#V z-V1qsPszK8-^Z(pKcoJNGa&oed>6ou|Zmi6>okaii1d3)z zS#+KPy|8Omrsxdgvu^l|PBlvJ4j&y0byhLdYRNNh(d7(H=7NdUJd^4t3ruO}DCC}ALKT!5w*|GilaV@yc`Z%T1_oMGpot*D}O_wIGO?1RAit;Hk8YV(cgiMT_ zL`i#l`z4>wBTvIr&HEr*MeToe=}O*f96OwO#@VTj70ewEr4Ul1FtdB?TPvSibBrX; zEYJ&|(>#y3)5bv2AQb|!&r?BypNX(B4u?jMPb_x{@ChNu1pWky_ip&ixRwxrD_~|A z&T0x-xSzmv{(KEv2Nwq%%9o^ion;IKGm{0|5f=oEdve99$2+X-&FskTiH9j-=#_+F z(3ka7NFCAcR_((n3xd!#D0g(r71=p|y4kq_$>wGU+Xc8S;nB`6v?P2TkgU{5^QElF z3@i%%WbZ*RUOc9>d$&4+;fX(9$5aBLBU4gf*(PRhL8^zUebjPqgzojf`}mGToCUXd$D4^aan!i(i|H3- zo{Q%ZBa`wji1*F!)^@^RXW}{96Lig(ho|M9`g!Dv(Fd1kQdQY8|m$db@+vaq^`F#^DQI9b<^*e zA8AQVcy-BK@Fc3HNJ}%r%yW8F&|&_ct(^zb^YTP zozg7aWGy!R8EwJ*i-Z4ru9TD%Ca=(|$#zp}3ygWT3(_&RTg_ERZmM$z({=>x1O8NfrGo+5&7x97~}KJtqop*B(KzOFR>yf8v7IQ zlPzg?S4lGLwp;zh?THb#&I^{q50~qBj1{EIBm`t;8SZ>}G6c8qUf+%YV@-lgiuu8| zh8E)<2ha7>#qqWE_1|^q&OY$So@i|M`hY+8d>jPo7VE_I*x%TZEQP0-2P6qkJlMJJj2MKSjzojkTYylD=H zT9oTPP_iq`P-}|4oh;Ap%*6G8Sc(&KHqtQ7Xi+Wu)>snxLaQ+I`~8NuIYTGFRG%>n zoK07F6dd{dlsM6kBa3$m5_PxtCA;;But~IDvI19~Vvi_Pms|?mSShfdQyn=RI68LG zwO!?lTIyW~xA{mioqg_Lk%~g7YB{P&eU%dz47`1eeYvs~w1usYwVH4HwNa_hjS)WP zA7^uT{=9W4yMwp40Ga|X2Sh#YE}F`TSYHxbrdg*v`@OX+{;mWrHgYiQael0yWC+&+VP zEL#~d8U+a(`CX*INOmf&Ez7C;C06H++^p=+7BocpnuPd>UsEP|!<7|qa|!Q&dzJ0# zXlCu;WaL78cDi<|j#``YJv0LvQ_!{XKE&P{3r*#{=@%z?TT`?1Q+mRf2Ou>zm{U_v zKNxFoj5K`Kobrv{0_);1S};e5B0=lBl?JFK=167d7oz4R`!`M zBxcrkEBxbcya-=zddTy+r>r{dAo;lq;aTUkvn`7z85T0tvQx*oE$ap4K%JsM&t0!PM{6@Ei8lRy0W?pTqeRk7Az-x9xc4oS}zWX4_zIjWR#Z|EGXLaud znwyO8XbIF}kUhPRn!J?={jTo@u}|BEofj25x`X*>T)Vq8f<^lF(;OhTZ+SzA!~DkO z#rwQ~(oMC7gXRN}Kba0irM^?HdIlxwc3@da}jK 
zfr!2tu4+mAm?{AwG>vPVa6n~J&yJo>hgR?f8dsip0b6_OM?M$t=aN0gN5x(g74gAr z6k48gmmiE;ur5zGC5XO!<`f#FL~Gx5T{`SXe1ol3O?hyp$Y90xHjV3RnUVxG9$evr zPaO^)#YbQB**A14(*cM|^u??ml69pSCU_4T7o4VRFj5Y{GPo_^UfLc{N#b~@?6ma6#ZWpjrfkPh z4np?h${dVzbsqhO)n=z&x?);Ob-ov8o{QpuxA3cV1$eglPPDH-Lh zEis&>=$%nI&6;P!-kQ13^G_8vUEIyNKG1VnU0%u6ng%fE-H96QyeCdKy!a~uKHoI8 zuvAYsyeexTD{82=G$qAnzG;w7vfO_9ZFOHjpFl-0Yd|e!w%bYOCe0^a zUiSjN!gjRI6s@YgYOyo8>%n?kWpwpDfunua7wIy@-}nlnpsq62$!|QLA#AXaprk~# zZY5-P^2znsc-eptEe2`|ga?^wg1_US+h4aQnFonIlNgK$dEr!h1WhK}Tk1n|CrZI$ zs(Be&R}YS_k1gmzvJp-6Dt)FW>f?Jml@Km{yvVaUBU&+vw|wO6JJP4BfA|E^b)3<$ zAoB$Q*m`k@mwM21>#52jb9(?RpZk!(s8FUumhR zs4lwGil9P-wIZUubT?Y+>Xzp;1ub+WRdA2(-hreFGCvlLoeHY(iE8y(~l;vG`E45=tw zH!-5JvLpdFZ&Ka#RoaJ9x>`EZI`EC~u4+X8$5TNMJ5=w#-mRR{x}u|GN7GH0xIdGw z$4Z^CxDc5|J}@xgp>SMCcMR3s!f}dN*m3)^@VKABf`WwIMGay_GRz81ceaJ;LiY`Jr4JkBgdI-mtqz`n zhL5ym|Da2Nhw`P-p~l8q+0=RHwDK=cY;R$6ghoEaBlimyy?pm>sZ&>Xo^$l1`{^%^ z76;E64bwNnIH??SyXJvX#-Hy{_Shv-e9SP&@l#uSa_$SFK{jif6Vr*>6Gq9x=TtxL<9tBq&uX$ySuwZy1V0@;B`OWua9H@*~GzGYtDJb zIL8><($W%<;qQtMRmqjSes#1&PVfut$MEpwM{)Utl}3SH=wx5e`o*RRhDad2O2%xpqP=)9ZEKAJSVaAv z(yKK`asZ|`@K`KKFWfq7&i6in?r@DhP45=HGiW}F(Nd%Bi;HSxzSynY{*6=9*m;}Q_O<|?Uj?hc2AltWxBI-SYQC}g!*c`KB`-xhpS@+VU4!^6u!r6Mz>CS8UA z1mGJ&t=S;hbf>AQ2|te-=I-6s#A(JRiWG7@tO+#X*9Lk)g3$ltA;A z+tAmURnq);tApWu$1#*HF~)G%I{!xTpw}wH>q&Tf^E7<)r5EvOG!1OmuWhQr*px#e z68J)xo)}^Ruz(|G{Ww9;vjXsodwO=WGE*h^#%i;Fs(7Kqi#7iPhTB-;@6qL^Kq{@A zkA-9&cPM)^JOzjP+~^#1R7I~ZT3NC{zXxsB1hJXVSZFY&gouA-(R3tj=>LdWE04dx#+%Dvuvu2 z^E(r7d$TG`cGW&;LaM}PuWr6s2v1L&Wx(vzN>3zf8x#ybG-*Y+EUd38C%f3$Kp=+1 zhIhvqkU;AppZA(ulxJ$2luvz@$|R-^}ZR24tD#Yic4Y)KJMpIhb6e>VljD> zNpDRohi_~t$s>{LQ(~2hw1#@=u4&)uhl=lR%Zz;T*Mx*iwr_8#(;|bspGlcsAe1l5x^z1>1R`r-GRC?Jh} z4cz#5YJ>(qdq${S9(M^At#kj)s z-;&b#>da%;)V(Vc8Bk~Iz8ra}nsIUxO`s;9BtVwDG1E#`T|BGURTgy>{ys#*fb}vC zk%LhqBPXA(=&j@N!mkmLJMI45DnVDAx%;1cTxydTQN;k-Gfi3yd#b0k0 zqTPjxhlFKhUh&-b45!SzF~&omS*vyK?3%O+;H$6FF84&w_kBL|CuI6cQ>$H~hi#PC 
z{-mEf?zRNn&EB1&WF1NP`?G6tdWPKe$tno^Y`XkLi196iIfDC5eUdd&Ga_EdT7*K6gQKN@`%*f1?6ZeF zHobxbZg|r<3N9=8o^wsAZqwHi2DS1rb&UybkCZYt8@xal~Hwr7gxskD&>TC}L#qw03%FFQPGva>hxV>~eF^&@W$WS=i!GV#2OD9%^cvk5rbHrj*|nyFqb+i|98PAy%waZ=({|2cgYAm2RN@H$=mE(W zzd(BOt8K!BEULN+jk~O@4E_IS(Eu!o>5CTG#ryd}`By}m zP%^-JvFPGIDG7aodLB`($?9K*`cGQ-mu9?5V7n3N5LeB}VR4N=%Z`1zBc+F2e@ZuF95|D^Mprr7o+ge)%$l_0)%!GC9`gkEc zSF~&5sScei_0>$qw>h|L6$AS0wDc*BPC|)&sixgFDs+?@u33RCHE9`i(fKXx*I2}d zjov6Z@6o_Gz^9t&(bC-JBY(r;3LD;3j`VO@R-c`$Tr%JJb-$_HZ<|hWd16Q69A}G3 zy|=E>b@nZu09%am^>L1K=wIM0i_vb*5|ie4S^aO&FDRG<-!dqew9Nk>|14DjqI_YK zP#|GVH6Qe#Cmb;d>=1rW=L4u;06yhvlcYAm%Ut0ylYa!t;QLGM=tp^TN$-xkm5(!p zUZz1r-8AFMd((w#t#{D1l&*&X=<0HTZ=PDAnwob~h9jxYk^R%f->oy0(G4lsdeMG0 z8b2JfO2&9fV;3^%WQlP=LIn2&-#Kvj6_Tjy-$YL(oNER}YtX8R#nVr}(D>RD&4cy|$fNJU>1 zo%u-*{Hm&N_s^i{5g$QH^i(FyG?YU}n6rb4qzF;FzNlb)e0(oaN#4&7H_pfLUmJf% z)hm|foOv=_9}q{LlOq~UvbzARrVY_=NP8)4BFd67RrjB z+uq~hP^vdNhzUeiR#xy-SkO69!^IZqVS{6kY6bQA@%pdpOtY>um6YKT8Ksz&Zb;r_ zro0=IO_5Ffy)W#v&0@j4!H0m!r+VVWOK1#g^%3LNhRK5AnO?^noC1Y=ij*&0 zM(nyi zEd49B)*LMo!k^{cEG-u8Y*f_-x7*Pb>6dy(8HE9D2%O>*t)w%Z3`xiwC-rspdLphF zJ0ua}uMYT?w4n)-(oOmXeK0U$h?_-Ilam<#_s>8ZkIU}K!gH`-f+(-|uHWJxK={PXN1OAjUldtUT%g9|s$*$Gm9 z?Yo6NrO7Zd0cnZGr0E{&zvY&DGYSj6jR7bfQf7jJcyOM#4~+)9*?wKf8I?irJ=~qP zc50`{EEK6nwyzz zjOVHla8r+#4*%A=;#mI4t{i7z7|{5?tEw1XB-<9_-2du*`@o>|%_U|m0HbhCcm<&U+RYmuMNjZH*Q3TT9XI<_YQo^vSnmcc6!d_(}{NClL;3vrLNlYAl{`%OX+({>&`L zwErtgeSGlIf3et#DLWY|-gbrT)so0t^gt;gk;KE+>~Yw;YBCQMNjaFLTj(&U z&E*J>uY&P#hc-;Tqg4@ZAjXFT*en~pNw=2DhN_nMTj%pK$R!Kh zv+(k2Re2+9FewbpWq&5@1|(TFb~bS0yE`eWvQ%fw%yIE?;ReTPJG#3;$M0_`sy1J4 zDu35STN>)?n+7u*+a&;8tD2lIIsA1IYl&|_C`CcUvrTbV^xpGy_{ znK2sG9?&ELe7Vu#@6NqAO1rMVSKtjk0YQFYp{#A;ck^%-)vP?V->O-{Ch75Al0Dh( zGFGqrMWbeL2vLSpxa)-o#RKKjq9pYD?(p>DGA}AExk6uaNE*sgKTQC^8Pblgc00Rj z&1T5|nnpEzTfYXdo$6HIr|K~gs)x*}uYr8%PetKG<_CuL3T#!dI=Q62&U85owYOJ0 z=V4!K$8oIZQ1p0>nh-KjZRtl<17$X}DuuD^1&WuN(CD&8@4)B%?F{Z#FxelD-j^Je zFzGdq@%zMJTT88Ru(Hz`Y2>+^%;!bteZ=vPjLkmnkq@_G-{DXjO8m01)P7r0?zFy9 
z`7<1?ZjM`YAZSLlV4*_Ds2?3$nJVIrEkZc6Fe1@CaM1sACMKx(QU$Emt?==`-AGpD zbvTGMN6yM>efGA{QkVT<#V~Tg%407}D-K@#j;9 z;i!Nrnq`#y|CnY^HGW_$2PP|#*#DFf&SWJ9w;Le#!j})F44dtrkZVBui$8}yl5)6# z1g^cgkdblIVhUW;2X4=swpqlS!9jYDHNK(_#A8ldSfwwwuaIS9&v|N{NjM?N>K<+M zCA#Cz^vc_^&G$JdIi}`ahcEbeBI_X@u4Xa%Bsi>n~~4vaRC9@VM=b zM}Gem#^8o;a{Z$!d^}Rx4zRdXGQ~gJ39h!bOSS8zmH`?D<)n?b&9_Geqj?l* zn|Jb<%&TsJLPqE%;gze~=TOPOG005A`wNyZtH?gWwX;oVZmbR;Wbom7b zif{{5^>R-`m0M&?R{;ZV%&>Vtt%{M=Q8sc})%flj0|*9E;m~6^@L%?q``XG(FgBY# zq$#KD_tJ&mjXD~PZ#fC{`7<1b&m`v3Cc!y0pa=+~uJ5*n+A!k4+hJw%UHTrt) zJ4SMZLSGwg50%ekrX&%Y&DCgR_Y9@;{;ri)W#DAN=}g6^b4699|K<*|c_$(Bty&FL zgj{Iv_PYB-!3EmdyryHOtR8#pC)M#CK(4`?=th2_~< zXmDXof2{@s^mVxz@h^wPOoQvsRv@Q(N}tq_zIaO5bye1azKQ5R(lW4bEnJEQm4U?O z)qnxy({DWg63DUC?OHxa{~9D(X`dqBA??JqXCeV3O5*Zta(}~1YDAg^A4xI9;O0BDp@X^w+~>ZYALN6$Pa~Ib z6lH^cMRY#X83rVvH@P#_6ou=Z<(Q z)oY@~K$EK?(>z)sV6LuMx5_*RRum~Jd$JzoFt$*yvUl8QZzWu3&ShYsU70&Bncqb! z8X-Ef^tCe-gj3pAtrOX}cg=zi!cVx^Q>-Qj*lyxSg|E#`C8_`M3MfbwmbxsiSMB}8$q&Act7W=Z?(A&#W{_j|O}*-S z+xz*Q(q;u@$+#(wC|%Erpcu@z{>?hmOpHFY63_cQXCi{XpC6*KQ-IP61lQ_G%iErp zA~XS8Dfyh?-ZIr%mK9g+t>U|2Y zYkdA}R~CCtuYPz0yo}D-phq@X?-sBSKy4;FA79pNJl%w+?c@R}5uj296{%ZbR_tvwps`Og>D5#N2zlDI_Rh zDRck;J6TSs>`*sJ8fVbK;o3~Di>ybH3eRTGeFU{Qh;94$ADHI4a@L7p z>Jl{uTGU`0_=4FoP+MF$Z81_SVH5GYO=Ri>JI`}SZX1!DVc#>vqi47hEF=t)k@Bke z2%2-o4Cy-AZSr5?X8K(;m;}!>zayM;ww*}T;Jy)h<_&-ne6^FF@<&o{sIWBu1;!6A zD9|VZrjJ6{N^L)O*++Vfw{tT+KL6G76r>}?u>@IKXg{t|kL`8G;c zd7F{7t2!onJ&?3m9y^@iY^PId9}>k6^Q97#*_T%Y;PluuG`GGA3@W}Cx{6245@t*3 zfsVmlgET)WrnT~rYHx&(Tf$$|6*6KgJkT{5HoxIQ|Dv_AWQ z%)A}Da0_u~7A)^h3~5jRy3YFZEkRqoW1c$v?T~q4p14wbPrY;BMmGF&Zv{5PL?U3POx)qTc*1o+ z=^94ndmn(TaH4JC6VW+{J%mS!N%|DEe9F*LPxUvmk-w-oy{)w~`PP;LuS1gnFp$L( zo7LWGeWIkC2#;SKnVZ6d*YF=kKkVPSq#a#v6MgxzTD`xh@w!>I4|)5&OV8Kv-?|;P z`+HJq#-FbD6a7I_pyJ5@8-;6O@9Fya)lwNR}2W{p%OC$=U%U0ma0DgJwHe6bg> zDCW*~Ft4wb^7yoonwHXo3DP7gKRRG-omU==^1oquQ;Tpi61m%R_q$yF7P``6N|=nm z1%7N`hUsYK4~Dq^M^JQy67uQ%7c)%A1UFkl&(X(mMg+nU)fhi;kQo5Zjv&O#8!@q= 
zT!@h5MQt4&V>fffExhO0-t+av{^!lUEGnnU_U>TeX+%*K^u7FfJOEy^rYGF&ySpO8 zaSv=`sWy0BhC1dSoDBDoKu zDt+=>kDygPq|9H5UuiKpL<Cf|1z?i4*jl6I^y#bQNKcUg~rFQ znliS&l~2(DNG@v7|EF?Q{we4(nSlNgU@72tUqDC+>yPea6;L>uzM35CYyH<7J}tj| z9^}g$sC?=>mHuo->`rL}{Ajcrr)|lA*oANcEH!J*-Nxk{#{^`jkfjO*1OLHT2v=vk znN!V@HEk%YzL(!~!WHE8ByK*aWN zaZiww^JXwhGb_(!(n1rHDJ?}@fS*74gjT!GCGH^D`|;Q>U;gDEm{7uv+K-5pt3>df zJeLP92g~>KVDc!2>g43KGdGzXt2i;ho0F_7V`Vk7d$ilixkXVH@E!jAP{4g8O|RNZ z%XNLCpXQ8&N=QxIbv{qldC#J?o`mdcEyZ%rpU6swo;?l`u~#C`>{>u~+CozTpIGvS z{;R4@Iz>vk34HL{jtHP)J;`5+HW`=s+D4#H2!>O*l>e8>$n`aAS{mX+V3AVjW~x4& zv+&GzqM_CeItgh>=X=~TI^8CguL0K0y%j5+5xfPA*eX#?vB8JJ;Yhn2}> zb39jNypIrgzI{KHuTpK%*(ZvS03^uaDZM~uD3{15T8^qZCoEFzwWLbH;O??VY1RR; z{zYO)J_FRcNUvX;&lf8A3UfKErQ*oi1|2w!SqSYVlp6iSvA&YQV;AE6s%TEV8TgJX zuQcrgw+l|Rg|0dKMjKFW+N4E?f6%>YqWmvBp`%l)w-`@+?;SS{#j*muU%sl+e~R1S~LtxKx4!DLsp zC^4nOT{ORO1IfBlxLw`5R=Iwr@EfFbb#|T;F&~vw6DRSM-9)Ve zW;>WtlsoqXr;1HlT%WE#KHT4e0#xK+G0RArdzYQ&s#+@uUYzS-J2%?eVWs48E&MM# zJG+KSAB*VR-#w}c%qhloRYm@LG>mcAwbU(JbkA4uoEF?4N4}?v4%8OS)mH^xVRBg> zYujtW_o*Ik|DCLr6)%z$4%a+?J*T#3B*=+!mB!tF2g)B25Fa5MGGHDCVT`{ zN%eIzfh*D9So#z)+y<-rK9$pJ4~I6TrzagJ?DJq)_ID_rLqbo0^FK&WN+Q)(zS`^d zXUaP-iq>>kO<(#GZkyH2^jSzOOe4rpx>0_uwQUntn~R5@9hWfc&r z!v6|sF23sc{YpwJ=8Zf`+VBs!>Vu`^Qfm7_dMD98``BN-xU4)dr{ z=hH|tazYWkKBaiQKLMplE`(_6${D<9pDmZ`HpDU6+;;*_rwYf6>{X;{U%s1fjR*A$V&D zFF%pvw}95a$+MKS{z5L1iBv4=z$wq3{PCiHIxReCBhnqa*FL`g&|S1##|0Qi-wG2{ zT?&N4_p03vYc98Z-wCTb@JSR4IJ_|%_Vv>C;{K4pWwR5?uGPU01J7Q5e~O}min6di zoi6EEWDR8P%;QuJ01?KtmaBOqZ<+=EL4(U)TU}><&$mw24cNW2M$fgql_f~-T4&T7 z^YQd(5-K>%uX}*OmdLW6{w`YRoVE%R>UryfB8=VN0p4ob=Q$I){yGulSo&!Y%)Xf>b;RhCbQl}NWmd^aV+BKy)wHNJz zBqk7NqCf7nwzm|3^eHwfVddEwfExa#4VCA5SEZEUFRQ^++p~Ne`W%L+wYsbsSOAoV zH48=GxP%skPz}b3(G_tXUKjeU#peJ$;~%|m0rs*0q$?S2_gFnD1thVr@!WIVnAm?x zE}yiwj@AIft@j$1lZ$f*KW9#5h)vq%mAi$m9o7c^3Q#-3E5H4cZ>hILg1a6M3_lR{ zPceG_3IzHtg);vrSi_XDi^ZlF0;(cq1qV~zkD*{a~uJNArmTZ^kB0u-hO;5X?Lcn}o zizS`OxIk0;^72W>0~C8J03qJ~wcycyp_^2v 
z)C5t*PbfF;d`)>y0N+=ZRnY%^jNb10z`qY2n)`RwuF^sV2V)_Z>`lSEovbVRKPqUDa;KHt^4oNF*`I75KDGHTz1i>4M51S+f>tJ%0P$}e8r zroDG-4frz_VXWaVyf8}+bM;aTKs9TU&WDTt^n}LV>kBTUS>1yPVEOa@jFZ+s-*7H? zi4pl(+=y7{c{E`{rjVg|ovfpUpvs5hQ3~)e1!amf;aLZQ1+|NHD(i;5cOCq-*2T zENQ#9oKTaaku3$p5pVfr04CS;8NY(|ddv0}^DJsGLi!gf7GZSuKD|_|(4RYhWXV)0 zWQK&JdSff-Z4-b=)T12OpPN?)*p12|=WGVmao6oGszuXx9dh3g^O1)t3R%Y%=o}xk zdlW^2iKjw(KX0wlu*uq70ra(lnT1&y-+S-iLUgjX=mmq%zc?JaWvQSnrf> z8`%1}G5Mg-Rya5cW?R5DtL0X?`GLDtIedFB{y?H|gS@SMJ*mZCF%z}~Nz;-Ll{6B# z*JkcWgTbwqB7EqGZyppW7MibIl+MQf8=3IIo*g9x?&jeyPmWBHMCtLHH)Jz8n6O2X zUR0`)Hf{;krhr*6?Ww*jbZ$Al9@UFQ^@>0w=VW2r>*1CQ3F|58MXhC)+Hc1TUn4r) ztOfZaU}{#IQMeMx&;{++^F>-hcx7W;;l_1DclPwu=p7ow*XFFSR*y9RVvgj1x9O_Ff|1liRe(y54=hVGeP*M^8 z6U<{Myy5Fohmoa@&pF-POYqH&39a`lgH^XSd&y)?>dN%StGLya3_W3sLKQHndYuaP z^{tG_m3zLIEE$na-`So8iPCi>h-z1UeFJ^+i^q-f^1E=%6*SRW)S7+?=;4$U5mW(k z9VuM)?18jvEZknFe_LYxF30vDc*(2}!#4%~?hJ72gO}O=l zM;e)OWYQ!yTfORzLcdJgE~fgqR82MpTox}hbXiCojMoORxxJcE#j@(FwpjWy879-L z%!UI54-u$9nPjqt`eS?2sHCbY^uMR0@kMZ$2vUPb#GOY)HPZIW0*>H3) zE4r2@1y{&$5XwaBgIs^)ro8vf370=*TgFU{I&jDkW)`x;aPCnFE5mgVPzE%CM#}`C zr&qiXoXk_JD*TXzua?plYZniISVJQtuJwjq)?QOTrEKlj6Zrz05JKA!E(hmwwb5Tv z#yF~Fh2fIy|0LVg@#e?Jk60-CYk2h4nvWRhRIeccSp3(p{Aku1tW7)2Qd&K<^~}ER$|yR78aIZ0~-( zD?rscR1TIKT|-|>LiE^Ukz-puy_*7FM~gs)82f89J-Kt3xRBh zAK00@m7A*6T_bL7?1byNdyu4Nd-WP^ao#nJ9CG114UQJOWw?$~WT8>2?a^5$tLk6J zkywTZneG-qms?@%G|E`0btsnw{x93!XQ=s-a@qS* zQrvy~gO-Wa$zFsY1alqD7!)z=6JN;Q+Q@_Phn2uuF#0h~gog<~>}9F5L{+h>(v5b8 zu$DE0wSy~K5PaK%2yD%t8UzGJV3#y)xomTs3PvVo%GJK=#clCqRDL!AOK6lcV_Ahj z$N3)}q71>;M(1W!ApA?ERwKsc05rujQ2X!ppGmbX_R95mQ@EbHzOAOBP8X4(nzJGdNwimtTt2XI$pU!4<8%tjB*^s3MXHU*d>J~Psa>Gc&cBYZnzLHsxXGAw zi+lV*J8~SzhP%j-CRGO-%%X#$Yc00vg2=8%xr&07$}28D8kMf9%1l=J3-G>gPYY06U60k@Gr$`lY^_yA;hQzl#p znOuYYga|9o#C3> zb+2&lwDZYgPB~hzsR5dsl}B1WnJ;=wM?u9`|39}mlGKPN!c0UlR^&geOb;7Gl(;%B zkEz_^ORwt{=6}uk6Jrmbv!Ym)of+lb`eEqz9|{f$tTsf1VpGLBwzG+y)p9z=OErE) zzqGWXKcYNl;;puKqj-rSC-U05=f$lL>(%x+OQ_8_t`GFmWYMeUD@W&F?{%Jn8v%SZ 
z(fE&|2`9aLvT}Mk9~lnLZa|jxC41RBiF4+Ga)7f2l@{Wd-;v6>P|k*3yj<5>No`^f zEufRL2l6jIpV_;l!?FC);f!JDTTlzuy=5duM_r1a9s z*%>Gu(_X2z!aN-a30U8`i%_gSs=8~p!^`Vj80jV_A7Knsg;&0y^h`2jbgFW4q1PLd zViqyNeENehF5id7`4X=(y$tonpx!9x(nzgao!x5Kl!gY22SrZP=`Yua+0LQUUy3=+ zr{9o#N68&NKV9EdJ4QNrs(vlTaI8fW&|_^g4V2OwDrH>;*?82xL#k`nJO5qJFNqoq zy~A|dd3^}cnuRP;8-SUtsQm*}7PSbK;Innu#xQAoI#;((VJ^0$kzWA#jONda>76fT zwqdlHw-jR+IO|g@-z3JR*XH{2B;xd`hLYcmcBcx@o>*ct$}736XlO(S-hTxX*%NI; zr(ZCkvQn|M34s^-W>2@)1LuouU0*&pX$6x_&mWE56<2XZ0rcIJ2g|(UOng}Yu zUTeT><4eBKrrb63U=nXRn2HNSBS_9pUJ@_E{=h4nXX@0+jko=S)@ju-nR=&kB=-fJd?vrI1E&T+gzMeu*NV;XxGSJs zJjuxU>HToi$sRjCr1-rjrFCZHd+=7PMuc#<;rH*PBSVS^vY1C(fKLMip9@VMIiKhO z`=hFONyhDM$4_Oga-MD1A`Y7*6_2K0giq6cjJKRjOC7)c--`Faz2AlXFPS{$SV3sG zOs;evgaJIIrzG<9;zB!;t-D-oEDuhE5w3nlJ8(f}2C!w>-rsI{`1bwNW%<#2EY=Fe z5Mkf$cs9Fq;>iB95e9a$K9`y-X@=WKoaSd2rxqg(7X*q zL9>m&gDTQ%%glPZebEdqTd7I$8-HvzG zzb0kMs%VZWUb^;o=fD{iKi{1{9=s#tSlPRs9cETi50z7*Ziq>Ftu zRurDOLuwlqy&EMor3$dh1|9Mj@StQMO5KYumsf$*=>`}A#n5`D`J;S$;gH`-@l~^fp{>G=HP$P0ik6x@s6G=?LL=If^<@IWH-WtX zkl|%k}<|4^kbE|LpOf{32QCzXvV}e#Ntge_jg@w>yUOBb*Y3YRPSs{oLSy@8ye$ZIb*As{to}V$3{Cbd#^&E`XH>L4vHZ+3SfFr7=W%}c;V=tp`4oqrUo3*4 zg2(d-$bJgC;MIcq&XbqrAqpUqv@r&1rQ5^_7e)k!a4(58AB3)_6re{8!>Aw}s;cejxBRcg9vja0{{ zON5sWn|Ix+XAHYyFv1RIyBBTlErRFml5&8`? 
zR)O(P7;59P!Q<7{Cpk&cAd6C_mY?JOZzU6i9LM`QY{&}H8;GKMAu4+NLmj~yWO z*=_lxwinh_$Nn;&9L$=`&a!lNj<6!{vooVO40@O=3kW$ z+G3S``**OOQ_V&jnC7WN0Js11^BIQGas3nl?>kAPznBg^vERm};CpdruCvIM6sjqa z(AzO6m9xIWtUg(MraKzSG0nH@2L_D9M+|Lqv5tL^fn0Jrd(exlNpTQxc(A3u6TkwV zsUNzB$r-Ql+07_(?7sb;@$G)v7bLI{XK|EgQU)LaH@|tFivs_HC-R(?ei7sO|Dm4H zsMM+zcGkCjZGu8zD8&L5^2mE^%6!t1 zXWJBG1m8CQVCDSyt6}>Q8hGhg8Rl1h|BSmp>H(6WX8G`AqS5~m>Hv*5a#OCn+yyxK zU_bjO2*yyZEDZQoK3>f#SuyAL3mD^5Elp$|jm-vk$PDQ%U7GG@gq#$q4D}xlPW{bCoA(gQi2MJjyz8WPH*FfNxN~?`tIE>I{+&=%Tq}WrvS$-g?pOS;p!2@sG>wD=Y&WzA#kcGyr#a)B!xP0y@*`nS z^h7OkjjG?Tazz65JFgGGc~JuZ=pTGlS~Y{mDn8&t6HVFzZc}?9jic>V{y=@_dvMer zm^qnrvnynQsVh9a93aci;MzUePvQD1Z4Sd3vD=sx=kyn|`sX}@YrwD4Y^vrW_c$23 z9L(8FwSMO*3~06?6Bh$=h?y46p8$^7JmuU5kH5%xLxw<=Hs8SKO^Fn1_xPvz)?rU@ zE?qVdc(saO9&wS+QufhTn~%D;Ukn!b1L=O&{cAoQ9U;6^fr|qO-nBVdv`lz0LKq_cm!DS(bxfHf-_EW8J;j zV`E?-?%H~Zw?C1U_2tXmYO9F|^q%Iy1a`B6jSgNpV>T1hG;<0nQ&YFZ%mZttq{p(u z79i*B^2D@XZac+csyo}Y-?~EktK~uN{OENT#X=!Ppa)#$chs6rO!8~$>L}P=b_KtH z;GkD16|NjLY95EFxor_RnURo2C-Hd!0Y~EFbB@rl(f4^*C3O?L1K;XU5aesD7MmO^ zV*Jy@W1IF;(v`r6G$W zOWJCyHESeklHl936dQL{I*C)`;(XT)Jq|EyOwm7sLf2$Cr_-BdiPqL`TXL2#!~B8s z63R+Z@S)87?@7I#9JZu|Bm@*{Np8!r5|wdCkxMZCWS<%HOK^AkkPsRyIsFM#`5dU{ z#HyOvl~5iRhdNCz(GAp|8w<0yC%4^ zg!5HToq9hk7I#AZTL%*TNeFt@lAWJFhsd7S1@WbufQr|Bx+1jYVi1k4*%&+&>i1X( z71|huTj!M?Qg(CEQu#jRE}&aK`DOo6#(^cfUtfDs;H_Y69C!3jM)8Pg8*R>^ikQTY z=LFlxznAMgZh_3T@}YRXjG3gRCCyQ}W4;<3WY*kr#e;CXPX=0WACnd5>h)+d9=l8R9icUb8WVwe2!0wMN2;r zxrA0tzgxWDV4MX4&hzz4=Z}(ApFf#gWnXn~D^(oqL@Elzk#*w(GbdKf5K(m+k-D=w z%Rh4LSGa@pz1s+7oqk0{SQP7mkUF-`1nLFq)lzDyZX)-|ICT)C65n{r$uamx&1lDH zyNJ`-hmdrECzO3a42!iP@6xJ&aFIq0uVprUS1h`I=)v851jVa|`{)-!xZqM|l6|<# zo$k)y^B*(~JyMX#02~d4Do8R19{G0;LM$}Fr}tL^z*H3_9~-L=OC_@7>98Qd_2_X> ziHlp41hn(qY1ZSX_}QhHTQ8Thy4+fsG9`p>KRPnJ&iwIUz>6NBuSsQ8pfB=^{0wsk>x4(vYvEJd{ zd}2=aMzBQ_3YiEoL8I?iE2l@5-4ugXzxoyx*legpJ~l&lN5>g|u!A4tG+0eI4%ROC zV5m6palTuGCN)05f36AGHersB3L{A3cjd^d4i;D^m?(&)4sVql^+3KOL*x?npl$_# 
z(Bx8GZQbN=Rjt9m=$TJd;_;2I7vr+lE!7^_+!j`&z4v*Xo`hPYhs*g%Y)#YV0(1!k zQSC+v&Ng1RON7qaY4+`N=QxiK0mr2TT@)A+p*PV?A+~h`ac!Y)k|YqXFu)<#P5s2| z$K|+t-)DN7f(iNl-y%st+(-sdt7N}@ieDgNeQyaIr(%a?V!CXDb~JyA1%fRcP_JL_ z^+(a>hOE#0#?Z9V@qJCA->{#!H{o%5QKq`n%&XDtm94>V?e~N!+=cRX){;w1`5HgNX~Gb znvF86N1$h=CdbImMsJ*WaHalZ8lM;fJ`!@8+<<(Uj28ZHcI zStVCHEaVD0SoAh?_o~t7vbm{Hv+4^pn>VUw$6-+(>rAR$vMf}3RC$MIXfP_UPw>6I zm&F4k0BKF%PeeUC5O~w~0VqFiXMv`+H=~P(hX>@C5-xCS;cW(wCsHLOuU><-l2854 zQb?X*^s)DX-%t#USm8**-a@N?40|i!Di-qe7BvBaYUpcYt$KmP(u`Gn70A}UpU4!_ z)%gpVOowk9PdR7Cb?Px6{en#Twz+BMhvk#66Bt=3B^QS*#*~BBmi{DH>z0|>FNO^1 z)!D7jspw?K^(uYNIgO!H%y4TQx_9^FbiX=tW1Y6r^*(F`z7X9~rSZ{uZZ#gcvb}%z z0gYB-XgOi(^OgS5Oyh?nm@awi$-2?dV63SWdUB5;(L<+OGOEzui(#ahCvv~ndl2GRNy}QZ9n1iq3kg?&}frMqsICYc~QjaR4c=H4XSwqf?8M+gv_{YKzN@4LJ2lKq$$dI;lbr)PruE%y zJtt@xaw?ew?F!8}nEm*3`~`VR>J0|n5|FEWT3%qN7~U(M!pJ&X7cF0SkJEeGYUkHa zGzekn$Q6)da8c2Qi9ZT-ca3{(EYQ6pfA3YUod*RCoppRXxr0B9)R%RxmCjX8PV7~g z?c6t=KbPlm>)ZaV2e!`X+(8GnLaUqi)5EH0y*$ryqqqEYRAb{kiXPH!*&zuJ$jhFS zSaRLx3Yqq92ccRB{NVqw_uf%aCtKU7qJT&eMU*5cNwQ=R z1XQBr*kn*LNX{7s6_6yUk&KdaY(kSoNlk_(=bVvDllm6UnK|>GIqSRMIqUv)*ZU8v zY3S--RqfjQ+0PEe!l+nY+*G(}+M~UWvw?4cb?Cg+|Gp0UaMfC=UR6_9 z-mDauv=-D}EE{(>H^XvxhEKgK=x=u;+O%;G%JONssSf+7#_cpyz{xN#gu-e%Rb3ax z<0kIinw%6_=Ddf+qyGl?86FhMeBTe@pgy0~2fw(_aLxZW=k4({1iQ-DXedp%3^SCm zi0-97*YbH-GPg!zD)TL+UC`oE2Ety0o~c)jWj)F4O4`|6*=~3sFxb@e@GH|oYn-o- zXybdtJ=pjn;!ZafbWH7+5V&LRE^>@W=#s>0ZELIuU<^H)acP=C_kGqZg%XB&Kk^k- z#r#NoGeJPap<62}X5oC*oyUSa+-u}Bm63c4zX+KCC%e&^Ndt;Y22!YhzC3ebHUU`@ zB`MI@oteERVEJZq>7dl(hl>b)ECfkONs-p4Rh}2ks-)$Gt3M{-xc_tSUJvL_mgb== z>^0^{7fc*}7%B^4(SOw-7v#xN?|4wUs?`D^d>{adFPU}gpO&do+xQvj>A^9AA#C_~ zc!S3AQ4^=z!lO-MdZw|{R|SmbV^^cF@yScycHMMXlYoU#aPO?%OKGov0RI?9Kmd|u z)<;T{*izCvM{{qWb~($^-EYzO`flFH_LM(NQ_9gApHGE;p{n6^+ zWkdeQqn|bqv-%C_pL)Y#Z^w%eIC4-%r7;F$rb#-tOpm>8R$Y_ycJfN=LY9~~|b>>?tCJ!JI=HRje?fBY! 
zA0GTjy&L6(3GO#QM&9Pm8J3d!&|*j*9^NwL=Mc-{dU#e{2z!dOC^<^#UPz;5zwf<> zh>N{))x3#HvMA8CSzIVnnincL<~Z$ z-}nQ4!l0E|qKI?H^|)2eZoZ_wX)=FFLMLMjwH%^W52`?`fs;_Ab(`N6LG)x7Hw z4<-i*8rI*$DuX1?>WYhX4rQ!6?pY}A5l%S)3oyB>xztZjoFG%Lh2I*@DBG&X8pLh zuiug~!&sZ<6R2VjBb<;o8@mMhmwR7D)T{8@_b%OP+}|xeFOBW$$0jUZH~Uq6U07TM z=PmHj84Oac^c`D;uMfoxPuU8#TJaaIF8C&=kRHvB1~1DA!p3g5 zi`Rr}w7-oj%NDpz#c!`j5Zh}UFW*t&+0cC^2a)jB)h5RmgM2WWxUm8}ww`z{>n()t z<&^nJq~;UY%#~d>Glx!N6v)9fggg2(UpdR7nN3YhO|y=tP2B^5Q5Abht^dG34nU;% z2p$#a2@gJjJsy(Zbzd>6y8xv7lHw=!GJ%^X%%*D)<1X8dtmT$NW5^%kv83@V(G3Rm z6Ici_I`sF|C?&6ldL;Kl&{kh}SjV`LJ%#sUgANi+@X5fO+c^C}El)1r$+X2g+e0r! z#CKZgB=aj`#!SoCsA=ZV<#R{l!G%#=@AV36RR>z@q0}6!JWYV34_3)r%?)52=WtS2^sC+D;}U3L_Eo@V#C5g6UIN-MbC zxXHXp!@UW|dI!YAHsxF6X<7iQIUFM*thw*M&Cy7MJw5GomW=rt{xuIt#zJBAo!_@;zi9(9k=%tU#lbIfudO{V` zp6nv1#(iE(RXb2*v0+Pu-#oIoi@!W9Nx-E-qU>oycf3WdswndvYievmHr={f>vf6B zt%;$#Qd=?krmcl=SssCGhz}1bS$Ld~bzL9sR?CkU4j331?)Y1Ig564N?S-T$hvndG z=;AI{x}1KglpV}V)5Lav!H!4LlYP|-~3G9wh z9$V6r+I-f9mg*#h_Z#Bb8(t&{uRCuyJ=2{Hj0Bhv|J?lD4iZov@t>jGY&P=pZKp^2 z=Rb7=`v@4j$hcSgGt&fEX*)Rq;lUDDMkjmy5@`Z)DVw#MQ$N3y6}>-X#o;=Tm9X2! 
zzspLXqUdDOW0@sKBIrB@&?*M!7^9_`!&r7SSRQ5Op`Q7TTTu%1XZ%A56|}vNnGbLZ zAYZ(B+$^d<+O}J_1w6{n&rdmDWqRvijC-fXw!5l$!v<2VBR=oR%3vOuK8<%jpAKfQ zx@KIu^t@Z6*Zv#+g8AY*^Qe~d!{8lc5Bz}ByJxEEGo3xzu=&h_SuxJ~fT;qKTrE!n zIneyFE9ii~Bw>6;IAuLPh9UJC)`4cdNd}vKY;@Ksuq7#b%bvRRLJNh;UrK@aIOQUl zv`a(S^a)~DcxMJQahKaQ@4#i?3-b)&7Apeut{|a{ zippcoZPkJ|cXh?%u0(iodnfGGZ1K40a!3n%T-3fyDGUgs?9+|B!{0!89K*)d9oa=r z_7=r1L+!{8TX(K17Hc(%5s$Htab!hkM$0{g)!L%Ig5~f$Jd2}{wY%uY)b*a;sxzin zjAd*uI`MhPe9IBFi~r?$jfrFk!Q`EHF%I&_NRsv_J3S+8WC3o3K>i|-3i!DX}yA{Gw zY~;8a7WeQMJ41ad7z4EQv0{kg@V8;9kOd_(zM`IY+CB?!6ionwJg0s1bufinxSp#8 zJyZc%w!L@zIyQFEWbJcro!iyKZBguyu*B7Y^gVg#1QOZRy%pthaj}c1Q4uTXvQtYD z!*<#$>1Zhs>$zV!`<6D7dHSmoXb0@_cC=b9mi=@KVxg$3j>~P+JLnoEp4=-vI ztR&Sx;2m2R?{DtuojaKJRf4lkU{ zZ5lA`8C>5D=001d8n)6DNj06er$da4uMeXdJdUharx8AJ7kSpC5@s}|vX2}-gjo20 zr&0n6Ew+X4Z5L8<<94OQC9J3YL@zgT;^r3K@xtYc=Ji`Q?z--7)l|*Fk`XV?tqVbL5R2YLUq)fy_`g`EHFdkN#()w@uL@}4So z8dy#NBtb>3f1SU(+8ea5?_EnsXbSIY-@GJTCURrEPNDFvK*-)aeU>vaemeBQ#Yxw? ztqp_C+HkHi&4f%zg23815l_PKfEC80BAFp`WVU1gThh2=bQ!&)3*JqF3y*oN2p@xE zJE@Z2+;APuC!a%vxmTn5O$Yg!Pw_}oYuvY+&gql1+T%YaQ`Q)_f7WzSi*(tVOyad^ z`DPsKgm|(Lc~&ZSpQFW9QnEmh0?S<|Vcui66&0 z;+{riitDfAxjR{YC+qw%%m>m%TrOl+8uQOzf5rC}!UOfim$)Mvx44sf%>?uky8K;~ zNRdxnI%P<*`o$&K7>O5!m*4uY*zK;l3ha<#6~m_~6=U?H-{=OB(rm zAK!)zY107^elh1uqD^59lMSVp$YE;=eP|T++tO>A zxx|c%(Z0YY>(zN%0YnQN=AjruxmP8t^I7~)y*quj+F$$@2qisEm!%{B_I`YCq$|1- z?L*F$qc={^7mfesrA>+3GB2tIeFIi$&PanKfiB#Aw2HicaAaFD z;8c{ayx4PMcYP3UDmnk2xs)J%Gb?G~C8Jut(U7d>!hLMFN2lNhM}dpE###Rs?XLH{ z8Vx2%TU0h*0!yv{YF(HrrI-iOc1b-|Fnk(g2<3gO$$7NWo<`iRVkQ8%&|5VaM^yJW zh7;KExnxRkWi7#54cGfKr88K$6wwiP8{EyEZLoZJ6%RQ-@3g+X!;>uJ^+;Q$Ej);m zNHD90lCz*%|ZF;&=P?@egxkGmuyB(vVEJBc|os~bs^OL-6$!a zA%ZbSn#)_ohPEXCg=VQm+}T2RM4p-7H(-UMt_c|*N|W5D5b=7!D~jV1*MX)ll~}xK z+3?l_kR*x6&V5aFsh&1^gmov9)TewMiTWO=(()_!XxivqI);90b9vsM#=;T>}bOVop1#M4_bLGenMF0SBl&GUS;WlL_?pSkaYv zaq8DN>4!MlHQA@v%tCMNh97QYdDA7_3psW(GEh|9Aov2jzUBZtC&yl@=ePQFZ!WZ> zm-pQ$PaM5aYZ-Q=eR^f)Np{{<0-PCKP8$mv)0iR?9&v5aWzT4|OsH#46SD8u#CAX* 
zKQsI`nCvecm4OL|8sC9>1rW4_vrY3e20UO%e9DkyqctyM(VaSV4P16yuhzjN(ZuIAk?uFRe42b%sD-4p9eF~)p)*WBw>#0#Gnh0Z~jDyaCXAMq5H&*H+F z3;BIjKOs~Fi}rA-QrU>RJJWdN!P=nL;#otdzTvHomDSnmw})Rz@Dp0ZU{ZcBEYUiQ z+DonJ5gIw`D>EbM*3GsF7J4FE53ppgjh3sCxX^a4U0cCB=9wEQd`=tb1gzr{TFT(o zcs+}*Z+A-tQw82Pv(hvK)IB}=)ciOxG4UI#=#nJSrwp34)9dY#r1{k+k9FrVft~kV zydFu%<%$^Pt>4uQ3x*N}ere=t_EF0)GGY9Xsw#QK@0fLFb2G!~=PH->XZ6TgN3%ZX z$**ddyGz3#NSa0EAAA8@Yl zni=l9V6@+F0$tQ(l@($!wnxw?(<$IAmVV!qEni)nWn9J!EU29Ih@#JZr^lq5zW|a% z*42wB&7})~!Pn$tJ3kGcx^|l(Y?tP>0K$7U|Goaf>9?k2pIpnYJCAz379CuU0`Ey*-Mz0@(Ybo?u+Rsm{@IM-jH?YBG06IFk|QtL~S_ z-aji#{U!Fd9@M}ViUVy;q2IbfCbh{c)t}h-6(ELPtS6fonfM=#W>~d41*@BO-+G@@ z`~kMUtpUr?3msE>wy4|@&@I~mzcK&t`>XB%LMq_U6EVxtl}k8Y!qZn~4Oc?$`Msl` zDePpa^E%>v%nv2>dS#aU2X7M2c8fg|M;MMl`MWu%cE!nzekqz!xC7J>kMDn>1}mVSIW{5r?dl$|iC>qOxJtV5b;x7=xT z8WCHd={DENcn7(_VT)2WUYxai-@hnJ9K53ZXj#mw=4fjqcM`;#3Z|+KJFClxV(_|R zL3qUZkqXRn@iU7fhga-pofsNQ$K%p`I(mF9U0opT--DtwCU31=Mvx5ZMd!X_`7A8n z72R7`zJ2xjSEf|&uHwo{nMi(ULzE;;Bw7*(JquH6il+OigkFB-3gvvhorhoz7HqA( zr$nW7*m|niayl6?T=?0|#gt(=Qao5@(j9481Bb_JGJk!;+5XeK)p|1?rd4Wz0;WM8 zl>~(}(=f51II)#0#8%Zq6Na;Ox|*h%#H1(bCdHncslYioWO;Lj&02mFpqgqJ9*Y2u zBN0V%PE62aTM%D75O@{LVMK3q96Rl_-D^BGhY5&%4}cj1t!YsYq$iDGqQezQoS8ia zUp2&H7mT`d3>kbbR%1k<7uB!FGqJ-TNT_mhPa_6oOy~v6-^OxTIuFnpy7D}EtY|`P zI3ksfoQx8C{ei20Xioa-0dq>HGG`gFg77^!$jBtQ-AjqsTk6slc-#8o=<7eU_pQKd zfV}^Hm9z4fs1U`8nHh zQiJEtNf$Z52CgR?s~{-7sHr6-6<8Lf(w{D^;-1y{iPx)eL*OIv<3uL$j**Pw%CaK8 zlH9X433i5_#HWicOG`=GAm1PwJ&ALZ0Gf7tp(NYJVwys;>>jfQt-B>TeJB}KjTb>B z>_jiEbBjgXS{^w)wj;P#<=chZ*G^{PPoqTpEevuhX&hA}S*`!nQX zMIckkKVE%56UP>yOOp&J+@)y{kyLmf;)D9ODd+)&n(L33b_N)-dXhj$XxB&O zY50SB*PS^S#mp^Y+M+XHC>B;{F9ZH>?b*lmh&v#StQD2sQjz@9!xfZ^%d_*6Xt`H9 zPklfU7^tCof3Y3LVsdPw{K)ELLcsnmv#Qul$@xmUr84e_DOLIj`3<4_<{VtO@4EXx zYw`nGb~x_xWnZKoz>Q_Qu@SQl*`8`RJ?Zk1bZC2+!C0r#W~%jA|9RY65bDb<5MCzb zFv?D3x7+E+08F1wo!xo&v+7~XxJV~^$4CV^dG4-AMh9Tj_m(;+Gw2=>g|hW~j%v~n z2p>s-`vNcri#*{zwwm*{D_oZcRZH|JV9ST_j@7nIiutv|N0;m#wSmfImPAf?RMYR- zDp{i`&{ameD61lr) 
zSM%D#vb^`)jNZhz7Z^e8T$&r6ELYvk zrR!iUWvZ7+0RZi<4dXFn#=N-~I|TIU1r9%XH%pvPh|wAOtA%H~^3Yo4xZT}5Zb~xH zPxq!R)pFQGptaJ!S!xp$eM*yCWFU}W!{%>$p1FGm`)G~?dPlKg_lGCW$HfzA#O3zq z-)NJTvk-jVw3JLyIP~b!Q@%8~$2101KJXU`zp&G-;-^zB)vh5;|~Y!bRx(nlEo`k}9Awr|p`i*Wt!POzV^IOYQpx34ht8H$;e z{__03Ri4?@_V?!i=`&qsQ7K&h0J7CJ0yYVj4=K{K_&v8K3S*6kjkD4BO8xHW!g6Qo zuP3?v=?Vhxc5gtJ;xFj%MpsmxPkr3cjzcOpHt_N-@>|8H-wl%dJd^>TBYt`LZIA8( z+bk2^M%T|oeEEAmq|r8}3AA1)n+lIi)_(6Bo0T=xv(VoBhBXTO`BpKVAFZ?wa+Y7f zp1*kQIdCF)nP(PPGeWxtXjq0gf4K+Rw{0C{@(BWpJae%)!2E%%3_Wh-yk>be_)D06 z$(9`vnCr_v?2N{J(zHNOsBEho+Eu{X>@j^AR3hp7|Th zp}(tu|9lxh&J`G-MIlu9ck=GP7=Z|wJB@cS68}KR|M4;uOI!{xpCysnOWF7Sk6r-(k@5fkL`6sj zm;ZaKjiWt-+RM7fmQ>?)+A@%(FjePqIvb?`B80B)?x;-p1Ud(2{poU?gn+((2VBHx zf6fkVCd3x$RJm+-2jA<#xOQ!Gb8~XyHiJU)BdhrR(XvH*z=37cqyB2~Z)pQG6v!$) zc9~rFuVC(YVFEQ^Ib1YR=Q0knj1}?x^aFDiy}X%J>e#-^lQ+x^srZ;i!=*aQY+l`y zB4P>p%R{3Lz|BL#!u1veB{+WAgj+}(9S-Uzq8;3p{YwcCY2=8PZvV|%dlLK2%*;IV zsI(d@b)NQ{R7epH%O-Sob}kJNN7<{b4Ou5Ln@$COi~v3KN8#2T<#t-2cu!V^z8e{N zJ-+euXgmCYi0e$_g9e8zoAxEfo(I_1ueS{vgY?^aVRMItHr=lpaizvg)o{4E+KoSF zIZiV3u+Yk?A|}qJ50b+TA=}#o#SJeIAif*N)~WWl6%91J>Q?75+9BeT!m%@TE*LST z#ar>opJ2hXyEf-N?%nNSQ>mRd88ihkpKxnkQ4_W%jNg);-_DQp@kM{7bGUaVuy=7@ z({iIouddeP=rF<%eNpWyA4w~nzzZ&|lM!JDMHF=Hf9@sl1gBbBnr;7_@?RckPJ{X} zzuAJqbou5;V;#S}a<(K~hN4h@hE1cYs@)XM8EOhWLlQuFKBT!y-~ zCd~)FTpZEVKQRsU+Zrm}6c1nvZ6JL*Gg(s!m!TzkqB8Tnod%x_a^`mS6BI1~7bS)CzBw>qZ6>JwjEnMzS zG4HE5>@&npo5Z+FbORy^tv`EWAypOI_Is-MU*y>@oXz%Nj4qBi@~GCZKPo2gUgh@0 z3M$x?!~7i_N5x=o$Y~ehus3w@7#|JJU<&@qS?f1arCjmXf2>#I+I5P8!q#6&+W+e8 zP+#1@qZIy`a$Xa1I={9S&!^A?@=>y+BAD#vXb+taFRpGU3%NNSj4eiGh1G*gmLcK< zRNJF9o8X$9GLx>H8nN4cD>#Vpe#m&6p38$M?P_kKfQPaP^x|__|ENI3)@|wQ zk8XcDXT!D!+G; zRR(2y?f&VfSl$C(EX?P(eX75onPW4I(#hd?CxlaFPaKUxxb-=cT6Xf|h~O<{GfsY5 zSM;3U0WN;9JV+w=H}CfJJ^6Q^CjC4lk1t%1Wev|Ev6FC4HXKln%wTRDo(5Ab4*2eddCC*Zp zp72@gLmyWCROHQ@H_z<$hjEt*hwT5UC$(=sIru&dZE4r|cY5Rv#9&)Q(0U49STS1c zy;p>VPF?yOvS}puTVCFJ@2vme8;2!1t8uuD7{kMCx;ye4D=Vt$ib3zVrap(LLP^eq 
zW&=d-^@K^&4E~)|kiliq;yh{&AY9W0J+W`El3gqCjQlQ5AC_`3>f0^oY~9aktP(P9 zs&L`9ugV%Adc8I(E1K%Ds-JJ8FBnwKVc`lss>{yaUYX0Oq^_>{XgbvE_~%98Gi{iR zWY`5WgE?`Vz8K=+pGW$wMB|i+#5d7@H_V4u{TDmkd0IP<%clID=oS;N>k>#`m&Ki^aalwRSqG$He1 z2P1cT`CgvLS`hq$JNt|5dpi2n99fHaooH-IoIjOArpJ{81el&yOd`Ksy?Ql(U|Yo@ zkmw0XubjvdLi@2zGDgdfZ(Tb(JFNPJx$dhgCH@wXmt`)$mb;krGULBfvi;vIwjcjp z*Pk=V?hPTNW!chu^Bbe~i*3o;tUg2A(SYM@d?#vP&=?r;(qf3Y&SaBw^rjMPoKW1+ z#x$BqBQ^UkC-_f0LZvIpS3r9nbl9m}_f*cY`L&5O_KSXvnZZ><3zInIyMspQcC#;j zAUXN0^?tAS`rS9~y1_rzDRuwG-xM-nXLW8~54fIJiF*@IHA?AY@I7XS*{><;;=JVp zZ@B!mD{Ux*YteHV4ldi1ZhMC?3jJhS>3;6|)!l^-{aRPj?FGipw~`-GrwzC`IDH)q z@`<$E1UNWTrB-!`*=&U0baiyNj61a8s_!FzAfEE`*ShU#F|+$^q;4-P^&)8KU~QMb0V1X6^zupVts2P6tR%#G~4gcRy|o*lmyHSjHqaws*L zBIPn-Q%p6Bc=01`OuK;L>eY+>jdm4Dn~B=PjZqczQM!oWN|z2<(z3RJp)=IY@$rOl z>4L6oEYN0FtJX2uuArX|5&l3x!1!j$A~Tx1{uROB7ROV}Z^ppioNiTGQIXpeYIMN> z04T7zgCFi7++ zG!#p0Y}~G)8;WdddT!t54~=eFB+=sR7d6RP@V)U6Znr+Rx7=ei)!Bt)HGW04>In+! zjIKpUX$NLEMlpmFP>yc9Z0#or^6A%kALwa0%OpN^A~RPp!5zp#+*kT>e+i*Fey|Q1 zl;AY&$fc(Vb&3k*>5s16tN59I-1M6@xQPK~P0pePlL~6WH^Vj>FoV8)E>i^wRdKx`cC}K^ zSou0*X;Crb!LwwqF-oC*1+y>~$WKAk^cVcye7HhG*qT;gQcY!N9G_aU(b0ymL0_Z* z08D*kIm*>b(LsCV=>knBp?O$7NNwiTm6dYKal6qF5$@&HfmUPY#eu?C#m46itDUZc zOu;uv&IBxx(=t@gyi?U7hA0YhJ_|bGr81d=gv!G9pTjmf1s4R_U9VVr5)$~##|z~k zE-ng_biwo>OfIsGwko5yT9_|e2{i+CVt*{mKVE%hpl07$XG(tRRV#Y&2rkPNr7L*k zc6hdmHi1UAcPeX5JDR9RL(s-cMn7`J2_H;_XMfxw?$rCqK% zg1fH9giPc#Pu7{N@M<1@dD`H9pv+utj)>Q%4`ouymqn0JVi9aHvgcUFN`xI}*fpzb z^(Pwqgm|StFEHx+Mt84MyMWHv!ffT|b1hEoAqq?Nt)WQz-#~l%9{r!E6_g1^SwVpx zjoTCJUe;{&zqV=!34E1{Iz8NuKW*LjA>2~qJ5|J$O5{^`ZeH!&^`YeAq(0xCRA}Y2 zRpCL$(*WQ-w4>z~=ksc6R@3vvm2lfAhzAszZv3HP|6hQH5h7k?^6}Qp7cT(qmfCfx z$&wyxrKg{X!M?Jv9y^a*IY6RqwLfLWpf|=DMSw{n6W%u+{UI{(hMqI!nMZ$)rHWIr z)~uk5P5ZzQN!NHd#d#>S+H^Y%m=`zKm6TXZ%n*Mb%h;T_yB!eZ(EV%_2+3bDTuNl* zGLdsX>YmBTvcv7s7mC8XR=S8TwrOqi*Ph3KC#I&RG7fkr3;M)gzsxvp4miq^C9gDV zpRoB5lCR6W3w1)KML62pMk6u3JO~dtJu+T#Z@swtuX1oJ@QdgNytu{uAEJxb{ej)K zo+`Ie?Z&^c{vz09v-!4(Bq!-$_6y{7$i 
zuhp+ud;0oM9L>ynq4w^vX@1?fxaLuAov+*Q=oJFb=-^xTWCCpMHn+uyvYZ5s_x|y0 zTc5to6r8OeZtt3z|N2uJed{?ltHaXU2xx-~y1ST2f8*R#GM%m`@!8IgPkKHqi5zPI z()RW*(igkfXBHL);l>Pm2=drkM^tTmoRHg1C}}qJWqr;sI*|*5LM{~!ZA#a$3AWUNdBR4UXYR45xU3Dm?Jp9!9Hk~lNF_FD zFY`k9IRb|dMAxNHyF*Pn<0&q26kL_`*tjRf`vKE(pJiOh1O5@g!5pO7X0C3j#&0}W zCEYUqe=+>OCiT5g6l7ie_!y%;UTe1(pSYhrP78eB;1XF&Pn1*2c0JX7F!E+5NG#+W z=r40g0EwIyaZA_3Q-sKb(p@g+kca40p2xy6y+`r17Tx5JmLKNcYv|G*#=3sp>^Y}n9>sP!Ia|z)_0~@aEQ67Ch9uWQjBqCI6 zz1^-)dgcyGK0j49T^6&b4|3iAylsU0A@C+Kiz2s5OMZRM0G#2#6}5)$Rs$a4IWR+@ z8|GalR^1EWIU*@E4R76f26&vBL$eF5xE)+8A#>%UNAG1!IzP1|f5>&W2-)ohval*{ zGq(!wZwxrErjVB!br&vKmQcKB!VG~#rprdWGJ$$}4G0K4edcwj;f|5^UdF7JT?jiQx`wKY(I^6OZj0Ovas%Ls2a~X^uCCIN_azBAsfMMq zLo$_bZbvKYc2gTU(&9Nzj}zr)htR9XsyB1=3pUP(LAqEu@3N9=(;5LJ@#D)j(^n za?$510Gp7+kSF-Ym9}okjrG%XbCZAla3siV{uip$;4TI*G?B3Y3gRY|M_3~qJB~!~ z9te87E`H?6RV~!aR27f&44j+|L7_arqs0Z7kHamO!%{MT%4bZ;IGcVNK4*|6I{v_7MrgOSx34Z0SOO0Qj{t|@Sudy3qu4PR zSw{t#mP)DL`#1#N)>K`A;LB7cJc9OU#-tuB$NmBisfb50vNgzpNvX=J7xb+kLXTfO z<5#UnIesHmWE{-b-a;{^@xniWujt^sw(O}2Eo!uMkzxcrZg0#qK0cnexLBn+ifh=M ziPSQ&we3x17df}=YY7~BVn)YG+4ks@vp3YN25u;lCLo+b zanHD`G1qupuwwm-wjs5#5sqYd;W?Cb>@&!T*D{`XNAz0}aAN;OW_(4z5r1kYiPPG- zm_Eb@X)ASlVX+iXvORKktF>Txr-*bVkBiZZ-9@z3VG%P$#=^z5ntN=?1?oHdvb;Wr z9qia%MVH?O5^~9hY<7-zDc|2DQOnEMsr&xDFV;fZD%3CWfq>h$4OVgj`D2tzT&QNP z!THuE!-?n9Z1qsAFN-gS$<3U4^!Ytj`vt%nd=lVu;tIoC<+{tL`c2GjAeLT%lzE+G zkW)N3TBqzytfRuOb(kd#zL(FeTUx87msx$GS0mdMaSR&a=z9$-9N_?~{024G#R zkG9BV(kA4+v9BZ|tL!rOY5JqhhePzw*}B3VetE#uhUGs{vSE^W!*8g9`S6bn2<_W| zQe*Xs55fE!LcSHKa@E5lAZXqc$`1MQUDvp%)R>us;3C%rruBnam6hMtmT)v-^CXm+ zKKFAKx5Hr(A=MNF7mw_x?y$VqLVJ09PRnk#a!0upQoKv2ez%7i!k#0LInRzouzoap z3j*OQ_$*{z2+5LezrFav;oaBgbtYqKI75Ye{Ka~8=`EH+WW&nw*n^XJvvlp{9=i!iQ!vKb%JE^*pA;w1RM(F@Z(v&pmOrA)CBt$OS z^O`BI21MvvO~DUQd!|Hc91-lNZ`v5c)?`qeNUqbP3_zXpDIpNdsF0lCGE+nbl=KKe z*dHHn_{iDLE=kbR4VLRPh{`i+ul4YGDka~ISMWHC!U$pUgr?l&{N^1ii&)22Rj`xF zb;0VLi8H{W%fz04fp*4lO@k2^3>0`&LNRUo-;LJSnCjmltNr3cc 
z{X1ICuZrXoc7_TA`eC>@C1uj1y2|`pBgLyW0-E*eb)0IJcP}jrHsK~0`pd>TUl9pw zT;5cM=V*sXGPQ_n?~~CRa(T>rNeweDtmaSKq68Z@Ms46BI>=|OF~)e_%kpuBbdwsD zO*3&N*4s}9kWLU|W*&h8y{?VD)qA@_8eKSi5B5=tkb98LNFnR#Dz{J&)|ve}T56aG zrD5jtzA(=fA{e%Rbo2We;(7C69y#9yD`5eW;xecBU-q>aqjdb+E*;v+oGQn+69Voy zvhM4zKF4-uR^d4A-N5 zts^Gbe$ zbmtm;{r&wZcI(|yhik~Srz3)&M}Y@s-TrW%vW7Q8g#hruTu*vuh;*w$+qzBH5L?P7 zGrrs3Sj`g|0|*`0dS$w%tIS~nd@)R22^Cz+O)^aGjXz9$uwg?*Sun9S@6O(zR zSn!H+fPpsqJ^7$z;nu7ByTV3RMl8Rlg#XWp6kGW%w5!OXt!9hvEH8D&4R=O#wBTpS zQ$@X`v_!HGR2Vk<9By)F&3#l0M!o^ruNG6~1>n$uew#L!&3i(SA{0W*Z8NfyG0rXa z#3`X;=veEA$I&*0@JSIAkFS@~SP`y@>2#ZZf{C8fiAx5=6cJFKtf1YojCkZaVj&V5 z4N6BMy*Su(_Em~H#J;}O`MUHu(8F;(OAlbX?;w(cOsoo8%FqrU1z~c+w6y*Cm!+ia%9JS2SMpR4B7<~29TLb#j#<)E_@5fD(9+i^e z=Y{gkyQk<$Lo(ym!Y&IxK!>dpfHX+5Rvx3f?_Q2D1%k3ds%OrW|B#GgnEbasQ;7pVDIY1bO) zu0xZ6W5VH+4xV~^b3lF40|q_Xl^fhW^2thYk2I(7C_6%Yf?ASp^%ZY77S7BV+;PFH zqb)OHRQb$9p=td^5ZSaZk8gi;Kn}^H3~YZG2dJHDw{`fKR`W?0GCfE`T@vs_1%Nua zcIv~75FaWS8>rRvD5#})^ySCxwpDESB->zKi(P%~drB?+=)JEC(_0X6qbNT`f#tI2ue z_=}WfJI%`!)f^gD5-ri>NO(2f&`lBd{h8^PXftWupTIalt=5s901$Zt4hjbNy`c_h zVLH8BD_wBx`QkFE*QqC?;Bx!CwiC>sqB?cR$=;qxJ-LM!b$(A zwJ~B>c8IT(_no%hWm~>R9-m!TxQI8hlNr@%9^939sLK3LXL^VZ1Zme#YFAkQ<{eQR zP-1|^tjIza`iPjL$4(u%IgIn=mf?c4h@E#==HC&`g%aaayA><Z~6gIBI*iy3r+YIcS3R z7jODGo&NM6@^0Cdf7I*$dteY)oxHonh-k*VGmu@O{(-tR7y+`)1NVDp&u6X8MLebk zC#rXc3z3R^Q^3<%wZG81XbnqBaE!~00z_gi=GIhA0sTuu!Dg1}L34brN7C&<*Yhs) za_^1-7PV%(r3zkUA)r-kGHK5WO?`b%a_HPQeMb!XI=i4<-G@;nUD@F)<#eqvHF|$E ziv7d>M`cUD=YH^AP^=u*!bGG?L;L3({#gIUB}@R`7e%k2R;My-6cs;Fy!zQwp9Gjm zoFm$+qo+QNsn$O1({UXuW*vZE&Qm`;B$y^R)#>73=AD>(zQp&afbR5_n3e?d%*u*J zgLPE(UY4bR?34V>-UCKI7n9Spn_dLm-s}Oo>H?Fvbk3psfNxEL+rEPpM73D zj5`h3=)OfU9@O#9%Xua&1`N;LpsavcJJLG>;x}VuQ`piDS5CUdhf7Ib97YYOxt%Ta z0bd9vJNlwX$sdF=$eN-UuneV>JGI9`Bm`U39teA;nh1NHxB~jgLY{jL5YFOhBx-+C zrqmB{L5q~TtjfyXCr9V6prkCv4nu1^G61Tm-4Pft=&X8u$2<9p$*Z%UUz$3VrKD`% zcG}$UY+2d5AHQ!KKGvz6hQ*3B#=3OV<`x|YT@-ZZ!9MK~{v76XFzR7l7BOo#yM7W! 
zx2X8utUOb#uSBMPx_V$Y?QWn1b;>d5F%{`X7cQZ@m&c-AV*9-WFa2K{JsL_9^jk)@lvAL^{r6DsoS zUOO@^wZ>1|hRt%mMzim>BrJ#6AP$bqcLrrl zS{{DTJ^~2&vnEIX8$Z9R5zrtDztq$bG{)}uQd=4U;QR^6 z`OpPRQfGZ8(68QO>DFDoJ6I$~Thjub3%A7@62|4q@{0^XTU=m>u1hap90jt6YXc^4idwnrM7Mwoy#fI2lwT8t|fZ@8>-z)aL zg6`$ygx0v&piVLea@)T76H6QVds_zrqu3ZT;Q_Y)OZd71uKNrg}O^6#f? zE~jy&WV$_Buary24Bem6Rc!2kF=`bJ-&$E2sWsPj^UX!xbfdCA?UoyjkVn`#I1IeO zS!gd2df_a+@YSRxMWZ5V07@!Vw6IzP?-X?@^eQ03*NMTr+eiL>YbW<@9Z8*Yh@~`Q z?dlPzJv*(P=4<)U_9tln=aKIu%GS$TD9|2;MN*DAQSJV?!Acr>c?tnbBjo7XpnlZS zWvN9$wqgPWRIsplt#_-tZcKQLqr#dJ>{Y=5x%zu&(af14D~-`QOOXN*U=)CPFpvF8 zsbL1n?O00+WxU+Il!`@_bWJ>^^r}{d&j*GGsU{(U^9I{L7wy{^Xwkw zty1FuL)tF4Es*OfAH=_ow)^BgCF=gB@It%sld}C2m^T`8J$~A3qS2t#tbj9yRtHk3 zGnfjx5U@aaC9*<3@WyTULXj;H(c;4vN0Ts1DPniL z6D67Ax%-?sMOasY6*_Z+AqNiTN;+0_9RM0f2j8+B)_2&kx54<<`PDv-OMO{Qc9^Imi4G)pLjHyw6MH zL%6+n7y1(2R4T3YeJ;Lf-c?tBUPV^O6oC;@LDGsDMkz7~GeyL*>t`Ry#XUm)R-MNT zEz10>yvTt<7o5#IrnP?&lz(_}qOzDF`oa3u?sjJFUgJ&7YfKObC1 zmT^Dd5%Jk!dHlFHF_MFQpv`u?S}tMuT}8F^`AwLeF2{j#^c=;g{rKo;j+%Uua3=6q zKrW7<_k+X5VK1%9B1oZf{Uf?k3K2Kj@>g3rs6qH}p}zOe-kD?91B)sD){x52*rV+o6g#|F=ORaxWHVPIGyurl9~4XDW4=qR937>* z5jB||bDZPzg4V(=1){yWu@_~&FIIF4D_%&A1;`iC;C=)x7 z2>BIL`X#i3{y$2qNJvcD9j9-pTyFhDVh0_BO}ewkP_Y}k3{+# zkikDYIRz~m20_DrHq0ASq?b(gp;)|Z+0=B;kdDTEjI?*HHOb& zqPE&O7{AdiHaqQi1?4u`7O<1Qu=a6yM$_+OkRCB35oLns?E%t9>nFX*eUXfXmJ0er%N03G+e6kPfLt2;t4uVd4Yz zE&-+@Jg>)v;U`pm_!4iiOdn{3TzgE8Me&N@d%!Wp?-6K_dFJOELAkG5OrM(4;vA^e z)$ijwMbE#Fp>RU7vXrP1UKkg*V>UvR+WP6vcqQEZ7IEVkHuHZ^^7oPcfqbTcu+Y+~ z6xV&)UGjA1ahaF2eRR_+O>3HRdIV;NIUDWvM&#tgWcwl6*dN@qYn1Ar;N7122kRN5 zUb7xu?qkQ6CM`P|eAQcWKg-QBtEvmPwLY49&K)6AfLVIz6~lW6MerDVTc3eT>;8-XhBd=bejQ5IHL0e*9z&PwUs> zLGKyplNpuF@=3DxQ=c}Dqoq!f{(5OF2A0Yw=HTz4I6psE^vT2S^*xw)vHwhG7K|bP z^xR%~9{X{j<^1M0Xf$*+H6zoAIwdPRJAPTP)cG_6vp>2>rP1yX&(y-=Mo27ugz0If z)NEw=B!@gW$eRd-OpWVgaFbYIt+z<@G{Y2AI@bc_#ozb-dbCDvyLvy?Bj921Lld0G z=2CI?OvMm+V63sU*q5?0lu4!J;WzQjYCM)RIjoH)!j(D0fG;)V4zG`GKz-19JWp%& 
zhgduXIVlH+>1ss$Lta8cgUESI6Gev!)f=@*lttV)_T#l@61@>cBh5g84Jnk4ln+;b?oau-3P9q2+pIZlCj5z$4`D6@G3( zK}KPrZeE&@RHR|_Q2!)nsd~-#av$XPSh%-`BARx38U76kws0Z_;UXm5+*_N4PVZM! zjf^}u)CY=VHDV#9p%KWo%=+WMk^Wt_6v5IXU)Qe=8|<(s$2saTDwHELQ3qUY;NGbG zjP{7l%3IPmZ;YPUoj62Sb0WPV4zA1|M$B2oOSP)y$TjxFv;n%w*wa}w!XW)6BoeRVmqq0@QTuzW1CB??sezLu9^h^O7-+K*XpSBdj;`HF(Q+6T&_od~r%=K~D`bk2s2~p1R zJs;m97BUB4V>L&q3c1=k!ADf0r>-LLGZ_QX^|9g?4n-*)`{uZGH2~+a=lEt-d#IL zSWf$=wC~eg8%b+*^kG2c*RB*1AZMR#km`u(FSm8i$1LXPr$M3c=t#6MepwC~axmVX zZHgw?7Vzv=q71C7`%)xx%1B7KI300|LdfH=TxTLj7OPgVopVu%<{X+_cKo-xtSTtHsY!USxT@oroej9m}+Xi z+UA-w?Kv{?fE_V_qETaO2KA*b3L8K%=(cRup6~Rjw4)HW8fYb3Rec!qLBgN3m>8dK zZ-E%6#>M(Id+F!!qpx0SzTFp3+|6CX-~)lW^WjB&BMCX|D#8_O=Xh6oWiP|c1TptZ zc$S@Iv%0{*?~t%abk=~Y0JQ)4?EYw*F(o|$3;2fn9##Hg|yOcM$X^9d7 z-dg_4%1LGo+jYmw=$`*9)w!r?-bmC0>Y~HKkCuZk5UjNw-&KERUdA?D$Rce?yvF8a z315Hc8pu^O@#RcYnC`5@C{0($8u5~s6(ReIcwFA)RvI8h)78pht!q!Q1C=N#OkQc> zXfd@)j|VF820d*CaavgJpGf~VQ+2>)rGzGSK&+k$0g>_7pK*m1@bymk zI#x{a+5wkM^J^}cs@GFW-~+cbA#@a;Tz;oxKDlS26$mI}v>PJA3D0k$YDV#PQ zz~wMKmS-D%Ja(FP>^yj{N7gBkv>M!zO{h;|nq)sVURvN2J|=4@hj^#Ix7B+AO!fS= zBpc{}E5?L17NlOITlkZZJ->#u17{T)PByhdvEiT2%54SLtHaTp z=H!tYq;wno+?Bmnz?-QUBm*{ejBy!4-@qB=*`ZH*rxF)uIes%&Z@;}I!D*w*W2Ktf z+soJ#5exY8SC=xY(_H;rbY<`@d6wu{jSK3=ip=d=eE`+{w9wcpQua*NW)=mPN-3C@ z_rXZw8cRcj3{v8O2q0QF$1;azN|j`#sAEN7gND4)R1J>)C-AV7&-SZx72I%a=3{Bi)q(2D*n6dSM z4@CxYc;Y02Q-Jt2itZ;y%6`e~evzSepB|dhBnvK)Pp*I0({J5=ueu$9C4w2OLr8dRJjAc-_MMJolDIu2Z6`}!giE_Hn0CFCi%6x4Q>9u>70GIxZJ_!qW7VcDU>_ry!+2x#Kew zJT^^*;qKN^Gq}Ry@!-|YrelMta|o?!o+J>Tsr*GqbDkfsTY;&Vi z8DfD-Re^@;z+N1pQP0Lh!k}4=hLE_@ZNiysNtRI2>8UX358@&1?0If-dcJBGY0Os^ zKR^i}mbkn)8;WcAb{*zoM;m*^<8&{1&!}D`y7--HkMZPSz5%`Y&hu{2HU|aaer2f` zGdx`nmc3wR3?sTW?Gt+YTy;C%KKU;S9$dQed@&dKMrKPpJoq2 zi3Q~FW+?eUOxRFRP&qCoOEqr<(>m$@Ku#}ZsJwHp>v=+7e@B!Sclj;ghT<06b39!Z z3213YcE^!;8EZTa%pK@jfST8?`1IJ8zDQ;6jWo8q1#GRI3}53Hz7Lb+Bs6CTsex^D zQafO`yh+cc^i3P$P{M)gE_TS?Oox=Z8liUHmOPMAIyy@i4UZ1bX>o z)xBX@(um@GIO6Q;a4~7iKUV9v`}l3njw)(gw^R|R>8%X4h=TO0O<(C^q>RsSrv12x 
z>LYRX*0WNuCX1!N%0Yz};5*;go3e;zWA=Dj3orF%1ao#M;~(2v;l_;uYH;Q2=put!G4 zVs0-ksOv<_JM&7zA4DlQKTcW!_!+1#A;~4P%(%Dy=0q>SVbZWu0oyHt&(6yK?wfxg zZlUsq3E1IdhnYvm#w-rUjmH+0B(92^aGTcWnmZVY2#@S14e=VuGZSn(>S2_HSUq{n6yRsbQzW6ysHQ~7Y&D+tl$lLwn&RI-EX8B9 zIWD9fuavMVLirt!KYm_9lIMHio$x2)EgZ!{Z+iS|Won^aEkh)aPC)p!+Md_S5t5a? zxJR*h^ZEzyf5Og-m+ldff{lm$(Ed%MS#a;_Kj$8=;Er>_bOi@q?5;MvpPFRt;aEVJ zyMF-v1074%>F{ABkc-uSmOWpgln<`SKQexsDch9HCAQfx{n|MAC$$AzFNDWFz#>*W}vHzYoHt+uEg5xFI45>`haS>^A zD}2u7^20pafQZ1sjIzGCa21njt#9|Z*WPycKb8gtnR`=P+wRz(fcfbv#?goseN*Gv z$!(bAICWL1G|(L8E!hRH@uv?y7#lGjdK>ajVxB*Kbz-qc%v)@4;-aO-6mrebR4FjV zU6#8C;$gVCU_22}VdX>_jW(Ek6+53%Bvy^Lk5)_Jqf`O>{rjn^Sd@^#Yl4!(%%Kr-m(i0s8)1Q$QFUr+eQJ4W5T5x3Yi7JI z%h(q_KeEheyXCcMfa!&Qk7GY-8hT4hvp>bUh{NP$58ttq&cQ*@kW|Ot1=z-ceoY)APK5=TUbhB%35dYV3 zqqewzlb>YtKh15EZYvk50_!d^fh5*ow(Y)T-vO+U!-Fl=|wm{Xv`xKA4$?2?6r`N@0o7cZI#UJBiX9}75(TQj-v zH8XL7R7YSrujFu-YrVySU;80^u(aId94%Dqu)0}c;AA|G( z!w4W-I}UGEP}9KNr<4ChUUNzypKq{~T0gkc(+7Jz$ygNuB}05gOw88G6XC8^oy)+J zyI%W@%^2W$Q7Y$-ue2+wFc9(`&84(ESf{;TS#A+iA+OhU6Oz@k`?IONC*ekSDVn|b z4R^BICM5F7G<$;wfcrkfS&J3he{4D8&vMw7)mTK-7zy+7J@%n*UFOLaJ`F0Ht7XsDxnb6gP3O?z=Gt2uWBx+*5~<*oIqL$* zY>3Hx-Biwr(a8-PlYTWC4dAHv&Yk~i4|O`4yxy@HH;vLTSe~3K_Rz>QB+rsi8E31i zsIIWWaNgSi66~-(ugKv+HvR5JvsFC2oHZf1>PWLp@V&XOKn@^@Au5b?)?EE`zw>xSgzzy;bOw z@tKjJUXSKCzt9yVC1G-$E~u+Zm6Z~hEjmw-?W31{gSs=+O8SC3K8E3F-d5Dz{mg0c zOa1`3s%mOf5qowW>(5&hOb}e;j&W!PouAzVhyd_=vBjQ`hDLc_wkZ8&sQGTi;+uRwwB9>tC@|1buv9fvg)E#nq^F_w5~G(Q(712p_cpdQCj6rt;@i= z*(@TjdRAd6P~!V2XrWG4shswxYN#A9+A~3CULAlz8?)Zv+R~oqh+YOPo|)0nJhhI6 zv2;g(gW22NPp5I7O|`WR%#D4y)nBis#4jQq!io5Mg?~+n$0GG&^QAizGQj^so|L{i zS$4T_b3lAn4cZD(ChPH%%U5if^iCew13QESJhn%HouCGjSjs+xL&^fv$uTH_>1>48 zDd@npg4s71ntu~Cm5kS20S`}2dJu(j?72p1ic02fZ|?{*^OysUR5C0;r(iLtc6N3~ zJ*iID%euKLfec!e9L8vX?jiU4qoM0#{q^;Ab~d*2ZAMA4>)mQQp+3%dP@LVxx9=mC z*&4UIhB8#N8$*dU6L*XK!>`@AkATf4i`|~f_C2;V#q){a75267%4p$pPR?c^{#bp* zxCvxmWniF1WcSc+hQ-84ia^fmPp@9RFE3~Q!mU8WSHP+=%@`s~TEsu}=SML@ZlC}#8JtAuC>UTc`Sq-K>B!Z@NcNtg+m^F4#r 
zvbV~E;xSQ)ogHFawh`-ofRUR*%25}as<6p(`b0_bK^!pI>gW3Zgm~_dUBTZQT`n?% zETj}=cZU*VS67KMTf`r~1w9^L+c+X2(JUBfPUflhv0(T{980&4;Z5z2ignuv>o+@} zMqy=BVJ#}xKMMN=Dzb>I3X5Snh?yqfC71`T_Uaoh?ChQ)ZGLJAXO#q9Uv3O#YV;LplUH8cC>k;}Pp05z_eRYMVR2;FJ`Y3}OKSKZD~b8_!^ULN zcJOtO{`yYA5b~3CiG_IknVX*Dv1&DdNNo~-72ja;@m#?D5y+M9%r{#D8aBZ)I4r<# z5JpjUEl5HvG(I^Q$6FFw6)!skWVt5wdygBOc5xH}eqXj1;xGO@FqOQQG$jW0yC{aP z+^mTRZa$i+1LoublhR%8d2Tx5H%3nK@rZx{A&u5(@dy5&Cjnl$pg*c&A1dP~$Ugw{ zA7H|c(-Ge)yIkU=f=+HPgvWegVc}_sJhbkr4XH1%MJ883Q{2z+Q@U7;%l)aS6GefT z(qPS*7y-h6wS~E$wywf<*bu^DJ-66kHlOG>h&&N~Gd3i&=(8eYVQ*pS`GN@CE;7e}%$Z`T2{{6bl#!Ot?EaI}@L1fkJ~5mGqC6)X7tPsv5Jk_z3g?(g<7+djyQ*JSY~N0j=lMlc?zT zZ`1~=a24@KdV5P-StzZdYv`%@gMJwPOvn#kQ-n{o0yQJHkC%-yT0qa2xvr}I3zA1( zLOewpwF$59jXBh4V&AjCl`b`H126ISDw#lfZ_C#@*qDL4-_LQE#@gtaZ5A8a{KbWT zqA=`Bpclr{0Pas=<4qt*Qd`go_fhj^n@xnNROPlaUuJFB#i5#=Z8F;K@@xv6>?RMD z+~|6L6ggT;+yw{Y(Dp9Fd2CN@fU}(0W-h}P^nwrCgNVm=_3>+G&UU@sV`*h#r1M%a zPblwpv4HCuWHz%Rwg$S92v~fJC|TEHh7G($+(ulgUZhx{j>HNbi=OC#c<-<=>jDZC zVWq7x;|4F4@8bx}7l;|@>RRhXM#%P5Z8I(cgVhH8WH^a<-Q~Gp3}q7Zx+iCwJ_DkP ze{j)$UI9&VqV)DrkGvk@TUUmg{zJ+Uy$R;Z*-B7L(`J7HpmA)i85`;4!k0^%+u&y8 zb_Q^!6GN3aI98dB;4BpCBo6B7mNERh!JnYKpPw?-T13S@UY(~GTA8buxk)rg$1vTZ z7aAe`Svw$rdw(ZMqrp{w9lQF<{5#!dU#5|I`UTJz-5U@X7zk?NeU`cLRvOdC;Hr1H zoo?~XrM5fUISu-z5WW`A&8?j+gPGeK;M*K+z3ysZ0Z&+~;CMS8W@=Ci%$F;gy*JSqUw)E*5GI^zF#Ha%@M zF&?wX?|OEod_w;@kx`$>?5qEQdJ8Hcq1%q+=-4{;hi!$_)>qeQY0fgh`ZlanCTm=#INjg5;eI$$`j~hUSWs@VH8-IN-y+Q+6H8>Z0D#*DUg1n zP)bvvPW7oH@XMtlky-{xb9poW3Po+pq$GiKe9jMHCG`VX+EbMn5;0Y5?9aoXGMfLy zs$bjoK@AeVi|=_4x%};)FA3I=8E96mY%?lZ?f1N39Iv;UU)nob>`;q>3kz+Cq>VQE zw0#uW$J&HUz%L?Gnfwi!yTaq@0ZF&H>J*^OG#?v(mJxmtgy*v^3sR+1G5T5zMjl`1b$<|cjjDhwS*}x<>92mu z$(E5#oE@o-P6lE;5;=}@@u_~9@kHbG>Nw1Wu29dp1O2;(dg<&66^e^&6?^l_!HKF* zR&}3-C8b&Vn#fmv2gMaev>S!9;&KUv)It67IEI0S%dXrX-1G1zYmZ$LZR7tv)fnn% zf@#=u`N%$k`74JpNGYltB%V+1Fn6xfYRa?q+c)3mERU8HKtid-l3kZJiy%>KXwUC~ zm_+PFmZcwyvn$`kHEX8L4Bg@&vukI~Z0auidxRvlI#0|a zHENE*Z+X8q0vI>>2}}o*`64wS8Ktrs;|S!qd!UoAW7FFp~UW-NuSDY 
z;7~!McK=+N6KuGb@{v9aDVdfB-vj7HP3WxXo_B1!^*6k-|C_&l3I>9*|3tH%B>$#k zesbN^P!`j9y+B^x>C!&dZ4$Swmt2Ls;CE&25DwhN8HJ!vHk>wJ5kI~LsEwB~Id7Jt zEgxv7tQ@IGB$={$4{w7w8!l5i?GT|<^5PpMwy~O6Ho6~SLsQNZR85u-zd9!(TPGG4 z=AYu3nB2Xk!*kpnvk4p|c;Q`3si0r)@;#lFn_HZT%tJ=vxSD=Z{JSm*YmcCT0zHX# zHSv7A2V1olDxyiZbv4aq=z)IygcGS9BFhU#_n+8U5Q{WyeEELchw*y?_wZoF-HL68 z0!(uM>&7mQ-`dFUPh&JDX6l$_5@G?MV+|Qz@dU5Sq##GX<(*W)e429zMD?bF6`z+d z7C?|d$*63)xrQ!Rl=lS1+CYm0F36{pM~pD9Zh&4k*-Epp z)2`Sf_6{~jissQ|2iRRic`22<$`t64CL59P1toY+m)o6duT%1o6`#(5^6ZMIF~p*V znNe|6E^Mw_BH8zH{lQ3W=$~vYI+r7OjW^y|Jjs_XB3$5tWO~A^7yK1co43r|o7RT&G2I5PL^O() zN^USJvI?G$_ zq^)!uqVh;J-2i65( zl-`o%0ho)&$)Xap-(B_+mp*xOs0Zgy68M!IfG0BMgsiUw`UnCMH0vB# zSpl@d(#qSMYmBHQ>Jd{B+cD zua-DHrZ!b19b_6Zd==t{fQ2@@@*hkp2&5C7g%RuX>BrVOryih3f4R3s0zI*zxuM#8 zdQ$l`Csvvdx|-s{!RYy7BU5`3%%!E`){9FpTUu{lXkG4zUg}%SCb#7Ppi#mUm*}*s zMQr)pJ5f#s(@dHR60>)Q(>N#TdR^!3{Lb!XdYQPEicvpsy#toO+hm^_mY1!Fx=0Iz*L(9w^Qki<`CZ@X7C5m?swxA7SXaCA!S_6$2^&4GnQiJnu78~G_`&tY zd2=~p#%7+7e6Dt4X?o4Z8fO3Z5uuV1Z0{Yv1as~ru-J%Ch}X~`5GE&ZQ#GO<{NJST z|DhX!pz@F&EBuTObS}`7qLqE`SBHM`zsHB{4{Pwf8FXjVJzhLW_mr-;diZT7w}@S6As z_%mLg5t)sg)u09)kw!t*tK)cN|6OY@x9ila1A#*T@>_ii`Ff9>Qf~4{m(zzTq@#n* z1%oTBoDN zbR6U@{1TmP#sDdr4(uPis~UHP)VX}8?63GQ7xEiyDnXiJ%E4aQ>omF3w$e1;I`N*M z>AJc&CeCXszUb@ba16qd%A*P=ZQW`1+zX@bWzJJ9Arq5_T7^F6R?ExlJlOTk&w&w> zncYj}7i;)TK;+&np{pBAt%82134jQ$f+rG&FG2!o z%EsmoQ6__-&n21XrbwA<$g+_UpPtr`;6=)nFfpS6ru#r$YjrwV;!O`D#$d+DjQhx{ zk9^9cg58E9NS8Cm@ij9gX<3Imgs>ni5@Lqamao0Ze z4aa9ur@dgVPjQr+j%B3qr|~oS3pI`^Ik&Ws*_wk9?qN+X4ph1urB)RCjNBgSxg{YV z?ebnz*Mt!?N+QybAu6_cx5|WxeTLJH&*;Usk2mOi#&VGvEwB6bflX@2$`3xEOH^G7 zD;;>Z$m^aTjiQuN_K{c$2}L((0jw^N0SGx&qT@?*FQh&y zvL)3mLSaiGNcMgg{E@fglz|&5b`8(#IQ@LpjaS*SxF{-IsNUIXU`%I|&*scnblG?QO+Oq4Z`23*V@A2aIFBTE}4hP}=br zvmrkB@*(S|yZXpPv|(=Ld(3OrRi-)s47xpFg%g`Db!1{DJ1Fb?Q=VgZtL$i@)&M|Z z){{$tBFTjI@<{A_dHh5%DTG5^lUR2n4m+q)cF^RI7iU_>O>zqY6}qnm%UoAqcb?{I z_mG=7WqW$M6l;_A+qsu3h(uhRybr8^H7J#+L}n{AXTnmiZv$XuV2QNA<@>W9u=330 
z?6EtOOXd029y#7&U2qE?pN{Q%r|4RwtQalPP@g?tKE|H})x=SK`-t`=c}Guxx?9h> zFWUHpQQ1BM6bTbZ>b=TbBlFcdhV z7w%dTZEFo0?K)vASN@%F)qtpe9}2yzm+_1gSn4jJ3b;>6xxX`aC55|B{vw*b`^B0` zP01IRv+aSxIoPljc5|KQ0mps4v*j9Tk7r=ad;|kGW{vf1RS(K$0?db>OEpUlc~&sy44rnv=KQ5wQ**G| z;UXA2d(tC(sW`VJ)MNlhB^ksK0j5-$@o13wCk)8JSrXlH(m=e&R(D8hoc%f`@B<%(v3|HdTZ*y5BszC zQ~MhXnHVV<8|czPtMt!hKg|X&8E(DRk7pWnrI!Ufoo%!})xV*PRB@v%y|LlTHh28|ezf$}=Db0?VT?gtMleR!T^0xjXit6(yQc0YB|$Vhc8>oS*0_Kl&=3P|nFp@5SL6=8@t5Ix!n%_@C8@w|C=a%zOva{DGT zqvqy*TrDSxdJ}(Ct$&sb3B&~bN89-0NUOl}u4?U$ol1eahxmoJ0sBj@4;0L${lugy zF^k>NNa`uIb*NrX?vH1OaEPgtQvF&yLYH}t5{+VME+uEqY!{Y`$Q&1?TTc!_iHQ3L z?A$1$w!tDgOV`qPUd zU}c%=|I802NA}%yR?`E;-UYMgYYdi^DT8x3SGE}<^7NF-h4U9d zLCOWgZAl8a%hyx;*E;SfOhZ1|iT9Tlg}Jd`%{N{~;gt_KrH6!w4uL94mGco3O*E^* ztc}j!^bE5GmoBfWP9WwTl0m1Z}gRAglh<56h5%LfOX ze#GfD6$(%@>Q@KbSPRME<6+^kSM8@RyqJdcCJ_E7Je%N5$gHQw{o$Ywvb4)VYciiQ zgl>0l$G8W!J>~x*FhNhdAC2Z+#jCrj^{`dW<)|{ zG1HJA69dC~e?fiLd%98iK4;r6FV6-RIdzqtX!k85g~qXf=*be+2w}b?d$r!8PAn76 z4}}dK)G*MlwEmOss{8GkGd&TbK#$--Z~`lx4pYoEf!#XjBpTcVV;4i=#p_G=cxqq| zy!YMLCl_6nVudmDb+grnOQ#>wIvcfi&w~2?yC?clM<*vwPE!B~N7gdlN&tH-@a6uH zciytCDgrY><&*i5@G(%oY|NgE0LGor?+@K<7;~D9hIR$O_P@VP={UC&yLj`a_3jRq zN#o3~Z~abs8%I*%bZ62><4C~y4t)JV?+raMw_OY#G!tJoU)@`F!`goM{U)b#6?m-C z>m>fTnJhAe)X7xh44*gK4DZbF`u#MUKf*SWdmB3Fa&w<1L>X7YXTx3}cZQanE@7rl zo_o(cL)^BdPrT`7*uc)PNuL^^*y{7Gm2sg7`}smia#7!U-mdMKuMg1`qTulybnVA- zZI+)V{r9O%Q}{H*qv-?jKOv(Q@% zX)pVuUT)yS_US3lVC#YW@dXOuq}CG#$U*S<3PNIpc?hr$t3nnFLU`mzNqw>e(`A$D zG4l5>b|*8d%yHfN&3!)R>Dwz0r_D65yU2>gwc#>0oCdG7f3-VYKVo)ESsJDS|0OyYj#tC z+co&pEc*weReyM%)$vd62tIWdN6?ce&c@B;XMgbeAt=BI{Y~ArbU?*D(`KaROe0H4 zn(4#pERz~5%O|Uk(aVC2Ybsf@H*OAt071=NV;{zX+V6;|4N1EdH z@xFHUY_YMda=T^E*JHyR7ji z{Gamtn>z3k1`uKzDh1BpFJwh}6$DDVX}C2I@q@}Oj{|(a`j+Un_JsVi$s^##-`QVd z2>>p8Gvw^opQ&-rgV+0V%+V1Z-Ca~zVx|E_Y&kbLuXX3MT^^%0ak^w;)ep0@=BIpvs`gz01JCr`RM;M3Z(&5~pt zRSs5qOA9LtZD9$WAWnY!D+^gGLmSuf77vTjn_V&jD^{c zp8UJagM3n(`dX|)6nYL>a-=<4E~bmD9WBdttdc=MR((VW8V9*u)D5|oeAsM^ao`%O z+Zf{An^UP-me*giWOL%^D%H>_Pb0!a$tf`eq 
z@IMA@bdhnib3|>#(?=h0v?D!+K1Zy)6$-ymlQ#W`P8VTz?zGmY%!jbL@rdjU)(Lu(ZkQf`d0u9@7 zuNHlW^}&_T8>?^3^jlnWdyU5KPgVUrdydv8C(Bh}d!llh+=xH#U%dfUHCw7Yaj3?0cVnA{KI zfa?SOJ~Z(KfuP8z;v^=9q>d*Odb{4m0NHq#j4*?!4H02k^-32haKuf*29vqW0z>2j z3oM+DNnt>fQ<>SA2>myMA5RSsl4R z=87Kwse%onrfgJoBfwgU!kdk+dV{-9VTd30DLs-ioGJK_UrK&~=o&F#NsKLSa4R5b zRPzT@Wc?i69hpxAODzwF%7##tc1ev zK9mNS>3n96<{Zr1y9;UnrRo49=}Ptyeg_F;E;HT)I!{3!oNjMPjgSp#Uz127;3)n};2NVr6toJCVK&4J6N^B6QORXY<#IY) z@C&;pe!WqOgL_+Tc`Q|X#GAru^lea2EtX+f`bH}1-bAYv#woTzauLrZ^X5jM^CMqX zMNuDZ_U3#b^9 z*kM6%hdNlBNMQ4EWD_V>wC`4G>^jFOBP&A+`Arb$pdcb@(&}ygT;qHH|$wW?cDd=IsbojlnW$AWO|{(NKMs6CMBcqupSSg13A-< zo#yOz6i4NUIV^g);QR`UK@}KCfuMPf1RVS`Ktw8NQY`ZVX*!?nIUNhgu@4F6_bLN& zP@y`t)A_W>sp*>M;~Ch+K~aD+-t8Q~NBfIB{U!G8;NEcP@NB>$gL{_APcE$3@Updp zM79$pn+zdm!7a>Hkda)Ye=pYc_B%nXHTj^#5Qwk}&$8jkP-Nva@9WM)8MrD5^~_?tlrg5h=BY z7a=L&>g!J6z1>_NbiD*5Zr1mEX8?|c7S(G8QI6I%1ovh~p!%sl#F8kj0*X4zmUIIf zowlOCTz>%rVY2D47HvUnqydJ*(R|`4ig*{i%*e?c=$l4PZYkfb%jF&4F@38y&=|tq zL@`w1#p{?fO8+6n)MhOpJ)QVD)-b%$;ZEMVVPiWnfxY{8tg0aL-XP%fyPEDS2(R;N zZb(QFMmP8?6a5qiFJDHnv*#2_%&247)q~dC+r3yXp7=J2B#$TUpYHr8QQ8}73?xyP zZ*E!nZC39f6IE?)k06W*Xjm)AcE#p;BP1xd0mXuuxaH}qNe(Wr%N@;)wB0jA^-r7zYp#!$#1-?tyLKYfy-bKk z5XYBA9+zs2kA;n3$US zxL6U;CZ4aehusd|=IrfR&XIUJ5%S^X8w(}2-L;jpH;GipRe{cx0ky!(`{4fCIfB!q zVWaDM-ul30z}Unuuo*NZA?)M+dyjOB;0sW2iz z^^H!cE9}DA$R7GV`MAooH_53NE-lrlujd_sR@Ns7=)(yEEX>Pdm0DYfo?*DH&$jm3 z&F+Rr3vb;pz1n|uju=xyKIo^}T`e{RcnB}TH#(S{_ML{3{y*#g8cMaIm%r>krV>9wzUZyx#`d7P2@fYu>MxjP%66E|y*%<1Elsx=hMQ8HP=B!c#A zP%v~cB2Z!O?R`LapmM|~MDGBb8OT-d#I=f~coO{SBy4-xD}cygZpD&k_Pox6atN-- zl*ldVWj0s7(n6vE>K+s6A zPJ{x&<^z4-#hY@$T~gfdXtT6-iBjR&ak4LI$JRT$b84ad2`B&+wZ`gzA=!w@)VbD? zHW$!Gkhy<(MFnd}NCYZWOVt;()^nHO8yZ~O?KW_XgHSLMf4d|9ZJvTam81mVLl!zo z&cCgtK%K+@!Ogu`@jk{N_yw#20^4qpJ;ez{%MfDfMt!vXI>2(z2G$<)w%A`Y110Qq zHW6}DHz1~XCc@kb_|fp>w%h*rIKd)!lr(rk?IeWwI}JbZKp7LkY)=f#)E6f{bEl>~ zQi>zvXt&7Ldv1H5lKnTxQKP4wp#oI?{&bg^L4^R1L=4pX4C^NdpHy!~m}OVMk>a)? z42_h)UTs`!j{YS)8xfxEaqvqS8lQo?yZby5>y(z+$X89aJ-g>s`=yAMA$5jiCXcg! 
zgn|&4{%zqFtf5dGt~&VkBrc`g)bLLe&j65e8pgSWi-5UurEl%CSQYRIaOChB{?`EEhD4C z{18zu5{3&Y86HDROh;#;x_a?0*e$NSRKKmr++_AJpiSmW$YMea!+SuDYLrVPS;>^l zgn`C>Jt6=ii~VMAZL)u0kh!uDHCUiQpE{2E$F5G2>a~JNE752wFGrXc-u$421O0~| z{7DgVL&?MH!g<^)5vXSf5GHC>o0})591bOOIrOBg0}YdA@;_@e<8qR3Ns|TdIqEzm zyn`!hyU7D-c*I`0HFjSD0~hJJbBP-~rF3GPkO5Qg~; zKw6+~dX>|C{E`k5RE+aYr}pNq1Iq0G?^%{a@xFk;nyBRXUo+7|BvjM%U8(l|2vM;C z#G^wFZWygAW2yox$x!0xwR?gPe>1O@7#isL$=b$@RPhYPQzJ+H5-3*)m51Y4yp-Q` zHlCgGblQwGXR6#;rf&|!#$Mx|Qb)APh02BjbPqF%zZgQbQ&(ZqcUY)jzmKbcjI4BV3CU)0yOhf_MBb{42~m$I=c=R?7t(0uY?~_#BYe|E39qr zDWtMeRFJjFBTSWO^)(35gPyfS^(uatZPfWaH0D+@qZ@U2@WiOo>C*&Rp@Dy&Ye@k> z^SMq&S z`Kzj_aW`^RfY}hb0$KM0}Rm=cbuwxd1JJZ5-OGtf%{x2|i<*lt|nZ$=v=W!Sl zVm<$+hXmi_)9ioTY)p{iU1YCiuo;&PMa;2eWu!!7vcSA}QsOZwyaNsI$$bRbshu%+ zt_hxRS@nwG-nt%5Y*TSaTjLJWB>emh{Qc3wvo%VWv*$1c#w1yaL`Ae%E7%t8FM$LP z#RUV%OZ>!jxLhy~u<*Zd<`V>@2cPRAkEci9;B7DS4J^ML?;rQ@zhPYP4h{fdkBI+U zPeC6+^8Aj0$FK4-npB=0!_srAfct?WlEfp+X>(q8iKMXaC?a~}!HE#__|wSK|BtS> z42yG3x<+vc0fM``TX1&^Zo%E%-2w^j0fGkz?(XjH?(Xh>p3KbNlRfWu_|?#Tadr1C zRcqC%RqgvfKG!ah1DpBoej!>>sG*X~-IEKwbh0dB>0Dt8qeNrR_qh8j zAZ6X80=on>FtDRqN;mKSa@6P-0POu$n5cA!!SOoL{)#X_sA(r4Cj=~9e9w>V8!aWf z$y&ZL_@E1Z4*3rFcWe?|tvuNsL_#-rti=PsyFjgudP!KMJ25UVpr;OK-(lf}#hM)S zd;XpnPP@|d7OqRC@^IQh0iW#Vbim{Tuwj6nEVao_8<&rnahwJKJ^U|H5(#koGFMtM z7;TcV+NoBuxEg>3hQC0R=D*L)v21&z54a5c$nh)cJpUhW^j~4o2=%uN8CCNBZ%$f* zRK87z^>B4`>vNTl>2sNGvzYqBZCc+mD!@y{_V&2R#B@SJ81F^ma*yFq5-V|aN3Qe% z)&@7XD}8HizBJ5aBW!7l+`FS`pUlr*+jZNJRhMf?{O+jw#nywq$I^UfC@3iCkmK~d zfbyl$eZ{;z=q_0IhL)q#s5M?MUqC2&({Am9_zYJ2dT`$gU6Nl@#vWh13s z|8~AmcQ#S5*!5Rs=I`V5`@uQ^xt_7Q(zz4O+&%2a!PR%f(m7-)IMM8aUI8Bm#S%8R zd`mRyw)6zXfy|rF@de`hO5vAV{4~w;1VCo_Q?u|LlHD=xloD+|K!mxZ;AeW&Ts~g| z(t>yk9c-Ww3b|bS06m;qRqL5CV1sYyuro;JBJqOmHU?yr^0j`DJ@sDC2pPNtmzz79 z2U29sS0DSOTfFk){q+pi(yi-klk#%DCdxzV*}?sBY5i6F{e20XmmvuxceLGxY`XvF zgRPgLjE^PG^)|A#MF7&@vd=YXfNbgM9&a;Gxky7+Ts&+p+G_3>ei^6BrNRDE6yQq( zkjh1#W3W!cl07rwLkmaf9|<^ZG6B0DEIN%}e%f)5!{9A_zD)NffHrzcQ0>PgAPri` 
zBx;h0`k7g3qu$1K=D$dOUUz9c2)Z8S2I~_qW!$55j4iE4k#o1(a@hhardE1!#$pDQ zL$6%0BG#5~tZ#@zjYV1h?+o$3`PRP^F98Ii@#R+kZ{N7y%72Sf7a^L(sFEI$Q#oDZ zGYo)m>nrUrL%joGeA7}#iLu6kX=k#|IF@koZMxD-x4%BKHFM{pJ?x7L^Z9H|PD!MT zgWd7^1`E(iyy$v<4dmBW@#v9k50^Xobq7erv=)U0JG;D;XzE-ZT3Oh^?km*8@q|Cc z31%rgh{o{c00F)x!RsEpUGl!M8m)M zx(Io^nm9h?_ktXl{(}!5jLwus-Q3{j5f8A>ZC?3j+fv2ew%?J0+RY&1J9h|+s46{R`+|F;St{#GIaS=SP5 zi|n=U{yI+4@vt1+Y{jZI&k-qz^W+E16FDo_`H{@&{OMunr%W*-%1n4McYAOk)hEdW?<>^%3RxEW6nL#D7>wp%i5 zxs=yYNAnZlfrs%zt1M|kk(l_E4Gm6-&tS8aeD{0?!{fc8Ahq#M5=IALk*7tnWTck>ATjI+)Dqzm ztnfLf1I!hXfmRR#J{LazWbV4c-@0OQYi8p-c_*4sxa^n=!Ix#!1(ikBZrZBX9Z;=H_!p1QP%S_~AFmL6BUwvF4;QvkGIVE^$If`Iv^3sBp}zdXR} zi~jwOeEmRF{4`tbth#bn?iZh(esbvL@3FP{4QxFvex8%PVajNO$vHFzg7QQMzAcYI zFgIY0V%g1X%s!^ZpD}xcg+$3nG?SHD-oI11SU_blDWftsn6A)v! zU1=69JGH*PJiJVnH1s@M^!&2+478!bzrSv5s@oocU|w)N_6UH_`G4c{3?iPtgIYhKe4WbDwQ1x_e*UuD5r3)-{193_kb6NkF^>Id#C{xcvdqKNc_z`uOY76iA#+lYuAL zEhlV``|-P=oo8of4LrzGc?l+~*1qSBp~xBaczE<=P|q`^45RRHaaDZ;*Z9gWH!`mZ zwJDCtG~G4FzdkUfb{qoa9c><0faP+?bqFIF3ZNyqIp${t1Xokz6g7II5*<`Ep;Dza zH00{@^?fhT1GEK!t8sZ{fKZa5YLL*IUdg0Xqn4Se=gWu&qh}zXAZYsfXI}EJ$CSvW z7k_yH{7zP)Szte5uEecg0i{0bxBF=ezg6|Ma(9NF@3_Y}(?yy!LJhIXpZyoSUU8lG zR#B2`496}!wfdw@7#+;!fIh1#O@Btu-e^D>2voLuudh|T)1YjmcO!CV(`bEq#0pw_ z4{x}8KWPoC9pz1Af^#RvpqcBVu!#J8aXQ%lXuK;`!OmAqia!{w@&o0v!E1tJ@LCU^ zfYNTRW=@2fN6;N?vIHl1;P?T@WF(GKQd+;gf~1Kh!=R(uDdcy9X=-EY!S4~Vrt8;X z9qgtuc%&?S0fz?H*o^UKrIL>D*Ec4gu-a39sRzH^gq2MZ-NuQlnV$D^n%_1!bzsiC zA9<9xl4V5tUs(5dLW)Y^_Yu5Dz2h(M2>}64Fn~7ny}vh~sHB^mqD<5~6|x1CV4;sn z@GOVs_$XH-%!esIT|xO18(YqB=s$KgYtCsN;uJ%T#sb)uPa|Or0#} zXuH&=C#UCI-6LzPfjpXNn3GIHxnwFgDa}}E=@F2h%kOzSq5^U<5##Or6;KuwIPj;& z1=dmu`N`kS8Ho(|b472;K7jQ)^o=zad%kX0W|9gSixT)tv%UQVgKitI>nm5z7q3_3 z9-O%4DoxABa0sd8FMwYJ2eMo7T={D#M_w~F07krkZ5@iKN~yWJnB^bwwI&sxF!X-d zf~}; zbWD=(Qt#-@MaNc9b=YQ4Xa+xOzpd$t>Y4c!%1{x(4Zpye;WOnQ4tT$bnF@&g3~f}% ztW6{U*}WqKtgWU6{OBum#Kmc%O4({Ue`P^UOdyX?L!tnXDmVO$M*gzyI{gtLs7uNZ-(SIt7Q4NmJ1YmNT$v^uTMabrGn2`<%UA)BI9-FL6T 
z_c=p#UN2w(pRtaSLv+SEb$PI}5pR%`%&!I4Pd52uSpC)aU568v`S2 z$-TP~@mn}NHalYs_ta|x=D}aKyt@huzV{NIXsM5MbO}xysW672qnUkg)MmDweggyr zsFk#~JYJLgcu!6mR(op|A*;TJS7pCS8yTS;b(?hZ za=WH*cpVe?ynCvZy~CIMSQ^s+IvAhTrtVbZ{BD~;ez4wXq;Lk$Dg})p;o?n=%JJr! zQa7lz!mb?26X-871msE0buPSI+RfOXut~ZMFc30!gZ^t?@%M%62O$3Ekh{IfHfguN zv+(|0h}}_K3*}-)H0NyB7bOpf!W#76nIXA>;!5?Tme(+E>iblx89T`S5OgZY&(FNR_^>&sL%6 zwyV`U%H*}+cGbVjbl(6wNvMfU<^VjNQ)Cd3J9LZmA>i@e-Pv+2$h|t>3c+X(+)+t< z*DH5=-A9GWg!2DHc)s?wP0D(4G!X{3mfbdm_IP@-bEL-pU`?kwZP!~*B~y%jVX~k> zj%t7{vNOit@B7Xd9he5L+vc5f1iTRFc`ClvQOYsMX;9)B8C1EyT2i$T75mY8@{1yL zL;5K5Z_xhxi}A;m^;;F^_px)XYmK`*=B9$~0hk}B0j3X#?oKuvHEzv-{>DrCaI9g@ zSMmq}IR+)!;jDwDaeFLh!a@ZZjN#JK=+~i$eZcQF&B{{8&1td@)F|0eFd4BGV5)%y z)&{39?^*Q6f^RFqJZRA728N?@M$BZeRtt)+QW*u0#j=AX#&pl zMv>G-*Eih7$Efpr0$>AglX52;zP)eZ;aN0?3qyN(W6*T!Ez4OpEe&xd3bTS1N>xK{ zVBx=7n297~9(2*3Y5QsE=}%F#b@})`9)CzhyU{1j4Ne#z=7+dn^-K5;K1a~B3Nh;= zIBz;BXwiW-T}N-iwf}wL{sDXxiKC@BSaZzB|0de~{QjliY;v=JLvv6X*I5k5hI@}U zf1tHaUKmWyOI<>xMd>@vWLxJlk`uITnVfhG^%og|uT>}Fn&kOTLcg0qk zq*E;+*b%*1hf1~BY_q$F7Ke7My^yb;2K28CQ-4?Wf~_E{;f8ttIdP3JqY`Dx@QglQ zL4cJ*!`P&$-#qTIOqVl0S%R|F+WV!ssg5CQ70w~o%p(8nT%A$hNzTH9u463sN)AygF%I|7rFys0(ah;jtWtY3nQg-wg&nEh@o zV6g!-VVc$@5y@KyTVm8Q0G!gmWeHLS@eYHa@pFENp!b!<kvVq#j=sa6)!?3&D?>lF_AQ9;kxx->& zv>QWeY4%XR$}Fe(Z-)7=lj{VS%dh&cuCHx7>b>-?07?ht(h;7aW(M^ zbtCl(p#z$|+7hiJp!aGQ=vh}vlfG5zy|X@e#%Nq^;8>7f3Sx-ST_D%iTs$}n}iU9$5m|*3f+)-_s#eHo7>fBiCK6YL zg&&^8=CjjmY_5a)A4dA`-0AlNfXoBDR($Wfdb0ncb`zK-)?2E_*LFI@a(lc4dgg&d z<$3#{=1Nv$2_$mV=EXU>rf66kkzWESYmdoxsp@_B=KOkfzf>W7%wp zk;$MD3}=v^K2p~=bgI_HQI2pdo=MllHIcD`>Erb8*L%f1j&-HiCQu(H<$h`%R2KN6 z)4j3VcLUBy7{#mAI?cp6xIp0GHaQQzwp}0dG&>m2^09WjXe5h}etm$~`x^xRy9$I3 zQCDn&m2%ujmKuGQzNBPa^V3K12r zP_?Gn+R(ACb|fMc&E)Z|$`xHtg-#IH7m65+m^xww_%YD`i8+rq|8?c7_Iv~(n2Re3 zB_%nGBC%BB&1+*l?H-hVX878JS5;M&=k4<-923(b07CBMh%WPH=z zN;(7-r0;9$dBR2L)!tN@Rt0LR51veN`_T7TP9+=gyi^I29q@QfgM~r({JSzh3ylhI zF{ktj)6#fdp3G>y_(lD86k+)CAk6gX1o7r{R_Q_DPzKGPYsO-CAWi%s`jz+T5gID| 
z$H(1BOQ8egy(tQ)>qgxip{y{ShxZ4Ct|2pD_V>+bLxfZ(XFM}|K7bMnKm>yz5rT>M z!Tjr&S1d48lk>4=DUT0-J>?4xddy7xhD%?ob&Mn?B1R-PpC^}5X);onlXEe31glJ` zhKtdp?c_vjbGOkY&dRECqNS0y5v?Z z++wkwyi}KoW#r)usXZnpUAyru@t|$MF}EtO$!n`n*RL(cf1~ZL{NOo6xym?&&qm70 zY1n4T+o}K>&nkxd+?J;syc<~un%73W1M(+Xqo9> zg=?*-^Wl;UuW_lG@z{kABXln!@1IGHjKHuDv5R);;GPC~GIsmtZIRdp-YNT>tk%ezB%T(HJsOlK=VSglWj_ zg1#lG*x#VIB>cZwIr8)N3S7=}#FXWwKy&|;7*6ch8u=xP^qKJygr00>gvwU901Ydv_#|+5yr^i({4`zx^J0X~A zFUc!Q#5mjRGki~K^ueMuI}RtEOgnA4?PITigRTX#e}*gwCoOp-W?$Zo+a;y~bhBjV z8R^e-Vef6f%;!kwF5NOawB#3TRV6{Ev5q*a~ zJ1rsVAps%U?6vbUc&7r$7gl-PhrZL6B=F+JqsQ>rN+tWL=L-WSI$_V2gGk zMVv1miGKc!8CtY|u#e85@rgC1kl2vy@jN)VPmZ7s&O2%YhP$JwT@aM)mcQueDv?2_ ztl9h7%h1r!A%Cae)CkBbJHD@RQFCFdKYm$*>g(+>8Hr_0R4o;Pe0R7#kaD@R;dsB@ zQ#D>bPMcJ119--y2W-1+hGcPB9mb_507m=Cuh(>y@_KPj@$j2WqG>41TvLJz=I~Y zOw;{sSKB!g#yALYwfNapB8*1bGSzDg(0nlJ<6D0!!`lv__)RR`p*4SDVq&teuqbdI z3Rc=|;lJ`gLP3G2Ad)!Tok8;SO~b{&xZ8f1`nk>yG>lFT7N}Iq$0$F1{UZFO#oKE& zs6|}g2*C4cJY1r1Xy1aUzrsOnsPUDxGjfq@sIWG*M$)7Exhry{4*#{-f82%ZQ6D7I z5I3V}V{D}Uc}oBmQDTY%V@C$P8Zuu?fNxggdY)(XEWh+ zy1%GMkLqwmy(rs+1hF0gZIBtS{egY)RHz8ti)Bl-Ay~HBz0XGa3?-JC4i+zb1OQo zjKb(py^3%)^dxw7v(gp#aj|h)SB<`WR!`?_@nO}NoN7NeGtMGG` zX_W5MgNT?|VSc{esAA6G?zHlK(N^D?1j`@G3WPX+-&jAldIDHTR9=!+?qiPfz43{z zeh$LL_J{E-g(;#JsCa5sq3NM`g&@bV4RDMyZLvX+5K!45_R_g>Tf{5u_#(+hQkiY; z&n@n0R|i|?i0Aa+yf1<+PhA~7tjPL3-)3aAt`8b23}OPbc%%<}F2qH0s0r zXu~metYI5BHjc@cV|=9M?#72aGMGruoj>MxOSeQF<@h1U&i(irw)7Nnyc_%0cD3gQ zd1G+Ia_R1@(d)%R%PV#MYdwaeQWbt$V<>_21PC*7;xSkK-tYIYuh@^-6&gxhSHF~L3>M?e6%J|0JH ziEGhXqqf#7$-zLa(!tCPa|=p(8}fTiAKx8UN(2pCsTdL5!fG9nE&JJ`OMk~GAoasV zh-T&Ry~)}c9$<_+F~E*iuQchI<>j7bs3=uxNhrnwxy51a=VuhCg|qa01|+shV-ei7 zdZ0iu8COFDkJP^oAtb!U$$`)1-20IEVx{A4p3^3`6k^>71|Dv!B~*)&{d5_dbp|@*Rm1i` zn~uTc*-(5MUc?bbQmkfjyeZVdIm+snpd-_IEECFB8U<&MJ4o)3;(F(~Ig8b+i5Cvh zV>f*KUysrAwW$saWa`stO+RsYUlcFvaJb}-X1K5S69u|p9^yz@t@~)Tu6n%G$`J6A z8lMakA)QzFht_%NM~&JOul@3P7DelmgRRo8MHU z)sKyteR$F7wTcx2cD1>Z(I^S|r6Z3iOa0jo`X|5D-TM1(Ehu#oiXg6eXG 
z;n6Ldr>h<{1dd);nR!63mBvCFfg6w4E!_rg4La8B|T>byP2iH70`!O_s~xO=oo_R;8Q+ftLEP>Q|&X@_4mZ z{BfK;)>2IB&&&Durr_{z3A%H9I8w86KcJ9J<7aW|%ts_ZnS$SDA2t1D8*GaOESu@x z%7yP%QC+e4#$fza!eDw--@%EF@KXbi-OgR-hV!zlwl*Fp_$uG31YJjy72b*S<>4nR zUVqE_a{aypNE(!Em=B>KcVXWe8^WxCyc(wfEHFdVeK>TeF|juJ|>2j<#WnDfc+wtHB1k-n^%1MX%5 zp_d5DimLO~Q&2U;N%%&E-^(D<@#XMI+Q5~FRRR~HTWi@oXDagw57R1f-&&4kgulSJB#2;%n+rjg;H0v~!5@`U^KnJl`E z8rXjQBjvvbI!%VxquRhU{^SJy6?*{|pc^qf*0v#oqm+#YCs=6f3r9Lt$dHW7D1!Btxn&^`7dc)CdkZL z1$+G+*5z>ZvnE~mOMz+TR_N(1ZU{R)4WNPj`TZRUUGCw6HBu^jG8NR?S*TTlqYYnY zrK51UL$_V(yZyDq*tpS6Kq6k4pvm;1T-MkFs!Ycsi4VNK=~C!jIjn|ud!Nc$bM@N@ z8+75AuDOSU4vOmyHESINcGUoYfy?va`PlesVW<({4Q4hQ;PS>)s&`5HaMkC2uA*;! zi&P!U98Ctb1)~^CiUfZ#`|>azl0AycmzyjTfAhM&78JR7feG#v_z09is@DVnm|>;C z^K)jCld|8B9Dyb(jc+4%3oHGao`=|AIXr~67X%uiH3E;cw>nTf@T`hE_?Hvy3Z6iw z&&##FWUb>l(FPOLJT1Y5FN}f7y>LaiD`f#FJl{kAK%LV-Qhz^0w1v%e{j1KZ zu4VJDZ8;;OZmAX>;^*;z${^YN9L;pUTXHBFYY60Ohu zG1<_jBWZ+n8gfN3m~Tdyg!$Hw=YN%4HjU2y4=R?+=`~9 z818^zbUK@G-Y;HwwyLcfERieXkrEaG|-lDKaM(suye~V+8)N?`T&+u=t;qO z)_L)ZC5p-K%to)wC*bjV&j#y$YP}lJ!qdqP8>YL{prcVK{|aeVj60?WR>CI*a>}I6 zb|C>e=;!vqyg5qr0GNr*DI$8Y7PD(8n(pU!5D%g!^X(U^ap#dgQZ>2nrkPN2bAakp@P)zMUzKDn;Ssv`2n=4P*EHLK4jc03)1 zE<-z`)`{G#)M{HSe#6&atL9-3Qj?Rfa)&LJvTCGe0|mh9cV*MLS*%|_8I`qvD87o$ z)*c&Od9eEpj>zq%FUF4_;x^tSi69ux_5y-#j#q2OrX?MfV!JjU!YD|@?nL#6QAfvy zqc3-ZyFlxp?{2MJ$7f}c<4L*4f)FoQOw{q6E8|aM&DIyh+Pj1i=NhV1B^vYpVuAie z{6*u#7|9}r{bc5?0Ptt*smbbl)J6{|ck8srq>VardTtD>2FML@_-y6$+F35rob@ofe!0THm0V3x8Xrr{@W*xhs& zoq(B#;_#SEg11_z=VX2fi+k3>;+!vCZRE4RogRH@yt*O0TKhpnVr=`-aKb7*wG4o4 zEjX^{nBbV%^&(-tBatVL>#+91Ns5@G zvBiJw5YLej@aTM4Q-azgsrCuE8NN_@Fk##MIL$uaY~USqmU0~oz0L%?FX|&wD+A~L zxlQnM!|#vARvUDUS8sDc-G4kblQP#Dru&BTM)W#=fbuDzX}COvLxnnsZ*$4n%;J6)`1pNtjfV>XBzSfjE zkel^I8Fke21pWr45UF&res{)_z#jh|#cdHLjmKR<0woDUWDWJzv{fUAx<7erg^#JC zqCbpC_Deh=9=NL^?DrpV_o@G>>R3PBVkxcsGo z>)=n(toQ2JJgACnxV#&I5N~j5bQ@D_w)543bmpXN$R}I#yj2PV$ulN1z2;lwVK!K9 z?PAvi@A_b9`1}U)hSU6M-+O7O3kqv*jnDFumn_=TmKUh!G`@_2KQ?*{`_!HA;jSg( zr9i>j7;q 
zW;@6Y(=4slM2WCLVDC(d?F2*u zGw)wvTpFqV{PXWXe(<$jU;JqbqjfhR@#X4a2%{VGeCv1_q{(u~gs%8=EzcYyXv0G7 zajm{X`f>w1ZAF6S4lcJv(Jn*NTe`M21DJdD}4@O^(J7dAu?&7`^SMaZFyOFTDUBa|5!YALj*AYmD%X zj_nM~6qY%*h3R$LIu-W?uL4p!fF^H^iVe#I?gf4zJc~8Hm?c5T0$uTJvne0U39-KD zMNMIhG8+Rs2MLAvblt(QT`;|tJWYe>GuTtdQLXiID;HA&b ziFfd;<^~H`EuuRkGIk@9ciBhxt{#tEUW;sfubS>3p048HSqn*bv$mi<0FxW#cl0Jx z%@>p_v2d{Tp(H84n0*4^8w|t;Qomz+z{1*>^npMmx5j*?LOW#?7b(z)57O4M z&DLr^VCif1dKji#gMfy|oP23dH+=MfHT~v>fiYXE>+vKT`;+kbo9$AgQrPavU_xp1 zo@Yng{e69+&KNKo*}$^y4#?^GaD_h4Ki#CLewL9dh9$WbYCZEX9c>=CFQGhyzm9mk z#hU!34@W{Wt=S)F+U$9@`x?MauPu9ToMN^Ag#%~cpS<{b1`(sfAjgc=!ZshmsJfVl zXm4+?l*f&eQ(ejGuLNz6V3ZzY0v-p`-h2(tVS{obLQ8BMY;}gu->NRnvRXTKwU>6~ z(t7B|v>W=vG!3BCGhnN_?XOm~J!R8VqD6!^)f-)&pe!(%htMOEZ%%AR_x8Xl8Lc#0 zYZZbLV64`q6xttdG-fQc+PqN)Y1qpkgCN5Ct{cc0Vne<^*eTh0V+Zc$S~mpAFh6(o zMdcs$ck#H{K1>>`jml9jw#R>P!JS9JMj!l`4IL%$4zVI}Xof~D?c-pq`xw|4u;a9- z+x_YIJ1GbwI{`(+5M&u}-hhWGlWcMdBQFzqVF4Qllm-7m))W_;9jJb`*s=Z}>j2;g73Ycoj5i=HhhNd5AIu!p(shN7SMV}K#T;He8;^X9bKYzN#${R^96_I zk~W?}sZ=2iit^gOJOYsbsCfD;=k(f2fg#=F0OIO4>zl@I4$51YhMFs;j@v7%4wqr@>GFRpZj! 
zmZLfA@CSF7@bh!VF^W?R`t(}Ic|`F9GvAxVuQY07b45jVdtjqQj(c*U!h~u?8byf< ze#tXO#DY-Cg0^4`+I&@pbL8I*^b-1)iLiwJkrDpp;Q$s6;{UHW!2feJvbcoE;#a0k7%GT%kTA^9`sa87@ zuVu1nPkBZ@gZzGcpeb6!S9ALc}k=GqwP)aF!zLK#El|@mb!l#YnCRW)eD1YBPmD< zK}M!YQrZTW5_v=SM;c?_fAMfqbt}YY(_V5lN#Nl1y$-B8=8};}b*QpRg zMjhs`7E>%!1tlYo>vs^txmooglHM{E89p7O2jYkE&ESlMazUw?a+R2j)8*m%>3I|u z4klC8r=(UB3V$Uef8rG`@>!bbv|nbK#LrLUUo1odP(2dRM^g7(E^f0e@1jm?ca%b| zqCY_}BlwF)5zuN_kh!l$lCQ|(HE}x3cTeUFR%i*`OENJs9!?hIt#q(V<dB(`ZSW z=0K9efHI}(E35CT?;8zod4imWc=Ni)@zYd;HngZ@Y&Q`<{oZJfcAHBJ{R!9RS(&N!&WoV7TW&AxlSj?r zAdBQQYq9p(cEgm}R`@Fbk4CWBB#)oyv9K_y_=TgYT~b^Z7_Vs9OZ6Gvp2n-6*}2Gf zUxnG(DlNVahAJX52N+jj48)!-b@6;<-_^ViW@<0x&%&M;AQ2YsW(@SEnOJu^#Jv56 zRIcz#Pk*3VLMK={>lZmv{|^Y4<3+*(9QhKdl8B?*V9#n~W;7aQ!ILkm=xJ`dY?#X} zu3fA8{4QZ+y=JEirX=a%9c;f*v@~jB970@F58~N6Xf{%d-ypbFNO@M!X(=#GW*eJ+ zo*VZA2Do>ktchcB0u9&6M74bpT?)eI_a7r7A~xcp|3R<o0R!Ms6B+9gt#~xrA6V7qixx+xC!3!YY-Xx=0HpcZ8S=?c+_*?2y}=D%6l(s1 zQk?AOyesIHdRbvZ++Voed%)$e5=WSmrm?39q)95b7!l$~YCvw&QKemZe}^;| zEHqn89oIWHIo;o2(o+2`z)Xz!&x6xWKpJi5v5r{kXCw1}iof>(5QFjbOXV)!t$qw| z-XRfT-aAK*50CNn_inj;-!=g?+O_jdeUetk)_`Gd&=HR&8yW$>ZiC6E5MGJZr|kz6 zdI}o7tNRNUIkh|JIomZi9OTa9HZMu$(}AnJW^Il6$t@}RpFcrm$1<9{{4Ghp6iAw) zoA|Xz25m)l(L#kmN!VauB$oT@FTRSi2jI+EQrZakJX~^8lP_K^D2d3F(>~2}2p+f> z!A`}Wo%Q+#`@S+~PBV9c%U9`WRTZZyu~4Xa@c9z65C2qbc(*K8DlazwvXcSPkU*ut z=lPJDhukNw{P1Bww%oyRHjMFG9z#0cxn0|i!_(vqjTa2xn^rFyi!A^7JTSIW3*hMU zHLKVLRo4}j$pXd%0G3J~!mi)vsZ?f@#E=~$={3F^lM!%~aAq%ESt-!0q9i)oYK$0) zDMrX+?S{QbIiC;dR;1e(nq}W%g7V^E?h4U|8i$5@ywXvauQ3gqr&X=q4mSt*R!qhr zDYQb_6dv1N+OJEin}MYNtWEg)hJzG7@4RW;&}<-Hj*E+l@wsh}O5<(XJw&&H(*;Fr z_`j@K$30{0(5vV3S3iV@$2Z06Wv9`w6P!IfQz5dfOErMCsx~8>U!8kxZ|-_*Ear%$ zi1@bqQcmyQ42C(!Q|lnNW{a?36?i38a+uMlQ=-YNx0^3D-}sZ%%JX?`Be({A76O%O zIum7%L2XmyiwC(TaiVgiiV9J3Vz4Xf`TtDtZle^UtmX1l`*(AKw<;p zt5!)g3XC9N(c73q0=lJ~8i(b97OB@Hk;%Ij?d z{JZiFJ?+3{p<2N=Cp3m;T_Hr*qv!ZAj1nCveo{vom4eUe3gj3AJb*k0E3o|rCvgRz z6rHzMsm^*uQZgL0@5^J-RNim~rzg)`yJ4Go*iA9kWWW>R%2`+@Sa~0kFR&t3vMMvj 
zD@`}rw8|PVQ82>1atn_Mt7SS`t+CMw!4W5he4iD#n2oh8 zCm1$L?Jrv^rJAFlfq;M8B2cIIIyzsy?$YLU41du zq65;;*AxKMBNtmus;o4;`5ebn2&c*PWUTb&%vXY4$!H*`-7p>^;uSipIUJ+Xj(^w= zG+%5;-ii&vcaOtJLc;EV@7!+2Ii7WRA3Hul-)}K?%ta~v zjdf=si*g(L2zKh>6Ok@}j|F6z?%8IaI28rCL9^ zY{m{3{DRF(U1kjM8gNhOekMO(qHGYAYFwAtHsRk_|B`XFVd#3-7Zshblj1u$H`Iq? zE`R?V8c3VOQkL9S{3PHwJ!QxS^IX5@LXdc~>kRYIjwlOogYxRc>SsAwn$hXMr{*Ex zN?jI-X=@t}{%WA|5||q0O^aMuY48kK;^rj4=eHsE8Lx-xoUz2{Hp%=VYB4AEsh!ki z02sqL6HS>-yc3AU!L<}(V+s#Hq93qh)hdzuc-Ho5ihQKS;U@JLWoS4S)jv+eD+uU2 zmV5B=gM>eh&%e&nFJCaR$8SySRs>Au*A*-j6^rBL<*g*pHdU*nu6vkJ9F=%TRD1m< zQt97qYwZoK*7f0Bx^71}CP(5c!6v1{bUbbRs@*^Ru4lxDLX@P2G&~zyN1FqTRUBPY zT_&6;a`b6rUfmwr%lDkWuOqV&nwSkww^d(U^q<7l&U(qy%7xt*nOwr&*xC%h z#KACIEiTDdpm#l6G~L=&(ccJxW|hsbPYA@)yPG>*$0E?R)zvC%iAgTC%QP6wL-V}3 zzV2ZRxW5n7_?R{F@XLM6$@vCn)|MQbGWH^+yLr!VU`mXdnUMFNn;)pOpR+#f(=UyB zH<7Fpj78$U%SGqi7#SakQ=<&Db3%x9UzJ?1eHW>%Aez~?&Iica zexUboZgdMHz=<-=n7dr)U+$Y7H0dG6DhVgO!LxBrJ1|N=<2@+xN@G*mAzo^Rfvt{? z26)QvBQsHF@$E2MVBjvef5^nkAEL(k8jaWtqXp>X@HA?0IQ1|?-6X64Zl4(e6m_6E z_9N8KY{|Rj#mRcA9sy=}d`6A;ECM16GnRXrY_<#t2NNZ_5r@ug9tk?W&h#qz@@TF4 z69OT{c|{Bd@ki>h%A{s1mEgqiq~JSj5#g8YHuIO)>ZT1X|5G&#_+2$b?99{rXHW*D zgQc)P^Ir-dhP-v2_`Ht&zh)Ud>kVy)yWk78xGgyQK0GI7+>a%b6*YQX?maz_a-@$M z1GQFt@ILr9`@P)!?aJ14AAsAjR4$#(w+X-x@PLMF+t3CoHVJ(CKffH`_EYqAzSsL! 
z?Z`8{9(<*6+rOYvRcE(IoiP?C!k{GmusF=oacRK=2CidrB>sESr(xwe9RLCPh zEz!+E@C+qTYu6aWL)9EwFIYQnE&~&1m$CMm zC3ZU^nLiDEF5;RrT>+C91{0DPQaN5B}&!BAO7sKhmGh+f)el|IzhUVRa>2+h~yB?(PnO;BLX)U4y&3 zO9;W;-62?TcL)x_-QC^o%=GT=-Mzp6JP#L(bpdPEsv0%q7094t0W)RgSgXet9y@#v zn|WM!dQvxkc|w%n4oj8N6fthsvyGP>*<>1WJhKs9n-$^o%V1OUgT;g&g1vnofH797 zvgxghEP`~gT4`kRa2B=i=~{75lh^GgFh-~{XIN|GvY%DnUQF`D)Exn78+i4z<#HI^ zreUVGgE1aUAJNdASPN)`{pf-D2 z%v(ryp8EmfU>ELM4U#(ND&?s+fQ$q*H1uv7(^`6ZLwV1T4esG_@k# z@@AmhL#vyQHHP{v?M~~~>w`+pb5u@d7m!ukw|Tyu0{H<|tliBns{~C;9UCrkz+}C< z<&F}tZugYXZXF)j$k_CJnm31aeI}CIsjP>gpCN38R>9k#e@(H9532955dw5V`!-ZO zJhIfmS6I+d(s43Mahmu_Is^rJON(7Ph1S}BQu9#BoxI|te`Cp>m%t@488}@-XWU8$ zj~VUFmMkWg`&1ox&~w=h8~dUe1I6+q;9y{{32cxtGSA`P+rOWoQ7P9gu{b~>-$0^L z&L%?CkCyp3ep*azlo=^4cp!@*>|Yh?K;dMwWdGn{=@P!W`alKC>~?z7JxaQ91h$DZ zdHbUTXQo2r2tfPVtaXG%R6Dv9Yv(AQRqz^9DqzCEmg%%so4@?5W-FN?JJNuO5_dCB z)}ReBm2gk?dtSwp;FWTBEaIVjoTC()7s!eMTj?Oz{VW$M$Mi{QD3MO5)(+V#x)*UU z2@RY{nMMsHY3APC?JTKMug&3TakQnOHy3^#z}gI4iOF^a^_`0Uo-c`XuJp)w={|?a zS6GQC#Fd4ggy75^;_;`fR|U8A=aD&5a0j9U0^MPW6i}?;*JFNs4%?Eegd<&yedoQNMg}RjqnVoOWbzpzIkP3;FD(G7Oh27Yt{F>W##Jd>1^~`?$;}pM zU-?*fh`@5z;I7tPbq}k?aKcwx>B@fK^CL5oz!&CL)Qx%clYqm_#^+8)w>|c;tXkCvS{7ZT~>iI@PX3I@Sjvj~5AdRfvYV_sPG{1cNkWjL$JYm)( z2g^NCVo&W~ZhW4bZin-yLY_iB-N{{x<+xNMel;Y-LdXJ5?t@pGoo&K#Ry9T;Oenb0 zY4sn$;@8}Ft}cNQeP$*BBCqxe!E#t+WUMs(lCW}MZ<%}-NwtY2OJ7(*?nf|kui~(O zZf>9y*q_ZA?z2ovm&?+R>KK33ulXvI$`+lejj022&R38!GwDnVfGxgbl<8G-WyMr2 zFSxnA^APcUho{}Dm{R#>*<>hUT%Ad7)E-O=%?bPQuy-Nus%T76L1Ci+&cR~WYN9xW zW1}*zC?d`2Wo#6r61aPU5eXZGq#at-D|jbM z)~ojyAi->x+nnboUiX29MnJ&$2R!e@9?@ZZamn9T;`xQdd2LNUonk%*3`Nmxj8r_; zhp*TJ9;J$WpWTG#r7HKvGkJZE4-gtqEYJpK>7z47ho6Fqg+IZdsUxut8FO?2AU|* zewF~mf%AG;hqxBJ)g~6n>9AnY?L^K9A@1;%3e+TA`OymvJRf6A2912LTDz-6M(U8x4jH142MoDZ3)KhPGcqTnxH#fbudVK=$;V{(D?u{L%zKfx z^%}e@7TRkd8EN%pswEy>w;4E*X&IJVO8SHJG>q zVp)B;X1d)N7w7~or1%CwNUM(O6~?l5^#{RZp_^6~W=Xiz$*M>6wgAsoUy zex288>Us&TSk8V%^h~_R?M>&=_Zw$GHetxn%hx=>`EclbJQcCSV$YGepCw$44&XA9 
z#jb}3W50kXCVoRkd!B`2737PVfmK_Nu-nm@^VFvBhcco96%-8Vh`v~_=F5%$6&)V> z4Js-`VpRnf7kBG)!()1I*5ia^SVE}z&Pu09Fa^h1@_K*9WcT>u3Gq}QA|SpOOFu|l zpSVJaJ;8=NoP0bMD*B%@ntW7%pBF8w3;rSSp8?}HX5L9g|Fw1j$Zi)6!p@5 z8ago;+A@{9T>a{dW5;P7@U>ZVK@r-0RA*TmU{e(t5qP1Tr+k60k8t#TiVr=jZJLx* z07xv#rF(H45M={PM8-z%G9?AR3OrDSwZQgy zQp=s*4$KtAr+ZF^;;rN$4#@|T^9U`qpE7vbZL$}$Cr9BpmQ|A;{wp!w4K{jLS9dA| z?BuJT;$uo=tv-pWDujr-D`{~@Ie;Dh53de8&d;Sn8@6zCV?0vn;7W~tSB4_n?;Zch z$=06dlMho21T)5H$z(ppB?8Oo{NzrTHygAUCI{E}cIeK~(E(MMZ+p&s&GdSUDYW}3 zgLc~hMf_6(iCAcFqj4NYDo(1j`(byASZ36Dc$xWPdrj|s5?f4~q1#*6GY0$F%BrUu zzbzAp=`}EpF=4VW2lT~s4T#t_;23e2i?m<9+vU_AFP?fAKf8h5PXTZnPB!{mpg|}d zvW#k1FAk@;RHjkDRaxU9dQi^D%p-LUMT`iirIQ8#r<-C@IJT|1X08b^8@&{BDSwk!x@uU$HC{G5G=c0BKiqHFk!wm%(0fWlk znlF0X^)8;CH>@R^rOD<|iNI~M+~_`Yfr?;*xDkdBGZNQr;^qbn3%+CL+Yzyoj-!YP zg8rkVnM4GdBI5jK0ji(=Cw>b&l1H<H#w_a#gX&wAhKdN=*WY&Uks^CC_ zF9N4(s}ZFz@8fxLrq(Dx%r|37?75sY9+I*ROqS)<#&wL>V9h?S}JpgGTzArATh z&|V7N6F|C2i@PHo$lV zh~)D*+o=FKr?0p7yimB4(-X}on2C>%AX^fhftpcYMt<=JDI<=s!PWKyb8o~)V*u0& zv%^5dC@wSw3{BZ!oT2NF7b?sk$Wpc#KZrtzTD@fPQ?@ISXOWCFdA-nFa9s9Bg%FYn zN3HjSKt@G`aZ`XR3{-#zu_mL4k+`u$mSLNJ>*SXfP-J~S0|#WNG&B+rl0FSroH3#m z0G=$&zmQUvAVHYJ4mTvbv^49;^GAAq7(0J&zheq??2WjGaoR$SX_89p*Z+wqAV_8l zjT*xgqnx4ME|9?W-Ipef)P@Z}vL3D<;(GnqWAxr7z)gflMBKoh14RPI5lRM6uA?8~ zc6UwEIDRB$vl5?|I92EXT(oikp`so@nR4?=AAbkJ$x(l@LT!OIXt4lP>l3LrnJr!$ z%L%a3^oqrHJD2F_5sxB|F<@WOsnwI^+Vy8p@GAFYpzK1DYc2X&I6_*iRwS#`NVqe& zAd=;CKfS*v%@;Q4OwAn6JQo1f2*iJK%^L%oa>Psrw7^&a*cXtI7~VE$pEmN?sjy)5 zk?Xc!8nlh#u$X|}QFuSdrK}+-P#syZU8e9u*^tMJz{A5DiV)DRHD2CkbSn*CTnyO6 zAmQfGqP?t7#R`Bkfr3cHCe^wp8m(|_$`yQ@=(^Cqnq@T^t8d+NIGQRV>v$7kR-o4{ zi=U6D^}Ubots!e6Xvz~dc2T*c3*iA!`+s-%1e%EiK0_RAvi>6ZS2Ga!Mf?rCx%No8 z#p6S}Y+4()_6p%S1vmh;YjI{V#=(27$KvJSAX8^qzXVQnG=3H<>@LHv>n8#N7GNf` zZqjP-4uEt)%Js`;5vlU@Cl;9nW~^5Ui828y)`(Sj#0^f|kwt_~KhPm*Vq$Xi_MTMN zChMS=N>yl-QSOWr;NF^afIb%UH2b>am@X3onR)vg7psLDP0|mklanh8z<(mP9ES}f zWkysEcRWrc$4i)x)R3=Sc;55=-BNeb{#tN5A)Ebl|Jf5GDI`J?R(|Eb{_pFAO!(P(OZBM2@4 
zjVjU_pRKSNG-3;?YHk(<+FW1ZtCkyv54z2hKD`ecYNJ*&k$pt_b+u--EZP)Jjp~k* zfSUX7>=8~z|I~<91|I(FyS07F(M*bqJs|MWXMGOP+pH)Zo_1&65ycS}7L810iu66( zUo9#9+RT-P0S#4cGB{+!9t9aYb(PLH38Nm4gGVWhBc-a?Dqh7ls>%2S8MPb}p$n(4 z??I;@FcUuq;^U>c16rx+r*cvyV<(pUdV7qnPBwN%pUN1;&D-gC=w)6q;@|c`ULlTm z0${WQ{Jz*HCo0s?YLr$>KX-omVHngs>8l&h52a!v?=wfq8eC3wbvy|hVIz}SBZHiHdCq)d<5{Mu|oxB^ZBq}arsn`_$XvVzElB~hA^nQEFP2b z<37L*G@_iFVl`W>9j&FllEZIl>iJ2p@idY`{#u3h{^Pf)*RkrsO>c>RoP>AC@}KM6%|HdDG6CeRO=A%#NQ&VC69CmSJa#%(k`$Zs5RO52%42sNbW_0^UUYlFz z%JAOIFX+7b(}k|xzVvxRjPat)eD02&bYoaD_pu)gZZEg+sE$*4lo?IfU(TV-CG5#M z0gH6x8u#nz8tZY{y-M9}t#0w?ugcb2CeqtD_H0c|^;}$ajGuKt1VNpT!{ znv}}!xz>W(8#H;6*olpRwtgS;-o>$)DSd|RTHt#~-7IVvI}DEz zIJE|Wby)P9H$$GC$vHgo;H->~DK{p!;xPm4!-)8GOSQwhTSa8pNLik3SXP_{rZfSeCn{JGI;I zJx<|korof&$(I<^#am60(D|=@gPu@OK#91dh?S>d&;Hjr0vf?F&_rF&y)lXD?<{C4 z=6|HJ6B9Z4R?YCd`kocL{i^dF2O27uQww&&+U@o;y{%#Was5`=nUSZ-UC=2M*@HQS zsvRymvq5K=U_TLVF0PDaZrQt`Z|6^KS5}01@FVGUi=%PJGg(7!8}jU2f%jVmUOq$scnR$Nr70*K!>tzi0RdQ*msB%ZJ;|A{Zi zhD{*VJmKj)7^i3zf`>3}B3##O-gmrLA~T^0+zlrK@J8>^psa0IclMzvPYb#(9)TfB z3S!+l`Q@&3Do9lH@jQ!J>a^uNP*<`cadIrCIy-SWyy8nf>8jT-`{QJKi!n@5%SHr z_?}|=UnTQ-3$)S83VZ$&@*hX?pX{JlO;CW=C#>VFOM|~NHn(7f!|Vp`o2|`R=Tf#X zX((@-sLfAxKRS<%8iQO8psXw`I0FI#bsrw>455@n<;;?=l|ANeGZZK@X1D~{>i9?@ z^kM6z2Pfw#eF$+v!5#c_Y*&d4 z%e975JC_L#=H?Gc~^6u+(#rwvmYo4 zU_Y}BoO{x%gj6l>KK*iaxcT=yhzQ7VkoQ;WirTIKv1uI*RAY!GR$HA4CylWp-O*-% z`I1`S8Tyj-W}VcM@#qrFU$oxP9P~}v?e3MAUDWzZc`2Kj<<2G!)HkC>k0Y+I&!X?U zA5T(Mfqv-V!3H%96r7#2YoMv$0zfeZ2n_EN^|G>9_mmEHXI7dm3<1Niy!If%Eh4_( zdKbCLl$V=5)LnG6iKR%-Z1W7AS%}JHMC4yeBq;N3*CCuOXD8-n_D0v}q;a{S<*sja zpjsGE)Tc`|C|&8T+rtIDA&9WR6tJbY?7E>b4@HQ2pw3CQi6l7$!@6UEZvPH3i5CjJ zA8xvMu{fu}rF!cc$@?&(KfrWCJBekUygAHjHC~gLAeud?p{PE&6equ zQ?x$=sWM0+z(A(bj(+hB#c9Qh<1PEq_^!em zht2GkHBsP#kdu?a2o&*hS7wt?N}>yV5Pr=UaXEIsRc1k z2beJ&24iz=CPyAV25Hpx%`4`W-Zw)kP4G-{yk5ulOFNNRhJMz}M-Rhp4-Pir%_rQm zR=W!;+5MuGQzeFw_y~aFN+Qq?f_RF&yQ 
z%x3L6Kh@ltzD*N+HpG?|A`a_JeMw+FIX^bYCnAOO=N1lu|`8cJjm?=j$0w{=)X=J&Ylf?;7T2R4Hmi5vk77iIUL$pGG4R6P387@K?Aj+@#xTC&k6j7t#YNk%K#4U`nsWP+3!=#3@!P6r8Xg- zfD7E9@<5TD+Y%pM_MS&0DbhoyA>4Y*yh&RK|Ja=o)Uq}W(L#qC65#_~K1N4Xh7|4G z$eA&jy_P^`n^C5lZ?6cj)~*1bNPM#E#s|cwm|sA|pn;QyL z`jlZ~5JcjnC4XsAwNv!nxaLF^4c`K&g97qhT_0Nd4D#<44-Q}75kfQme8&pzQr@tE zu5E7#x7_4H%dh|gAFE2#-=C8gj7-yM02l3NIFuan#CEoQqm1lr&uH!q#({WYj<(`@ zJ^-L#x{9C>GXq#JMGst+pI`&@^70}Z94dy*|p~KZ4vMtZe4R~1>nGGRu-ww~>GRfrt zi_9lnttV`;fsC9tfK=l_54~FEKv)HacKMaVELrRhy{)^omxA9SE-=Tam!`3FB7(z{V}M~VmLfk$>*{Q zz6SBL6X+qru{0ooNs)mKC(z{Gd>{ZT03*|J1Ao2WEUYs#v9b6%2@kMl`vZVQt7-F6?5 z!~K{G6H`K+)ZQR9zp{UNP&%jpocm4 zd@H$`sSUH&UgWLy(v3WN?Uv~>LDbbRevV5?c;f_4KSUm%g9%Q>xjH>#JaYl9BD?oQ zC*HGk8jZSWnrt>cKX2!8(%_56G7!R`_H@n(va1DyXHOKQ;`)f4j?9LbW_xxQfllsX zvYjDC{0EKQvXzUK;MVdOdSun1Lk*E*{g-^kp%IQ--W9ekXGeeJVr~| zEAv98(ZvzFuK6!r7bcB2UJaN55#QMFC0zA3zFT9XNvpb6Xt5W0qJjo|2xzb`XZ)}= zeCJ!hXzy-5C2oI5g771F(caN)@zhkUXQ1XyY@2PNbV@mebRg}x$J8QF7GJu{?{wy? zED5$jzKdLzuzSq%o$rlCG|~R9J_z74fVqwP%({&xqpJjiQq9Set4^Z_GBUz5y&WQ3 znNElAI3M7awF*BMRmAt#I`SP7Cf_@wwErf2)SJ(uTzuqmZ*RWTWVb$W>rUZC zrg5&dnoO4i1-Z6REw4IS*NrC=xq-L{V{R8C)hx8WQnhm~yxkLTeLpC?PDmu5>t=pI z4dClQ!Cr1|I&47lpQB2bbS4_bV>S(@0OVeRTz4gRq3vpI#7^=-ZUo8#f2Z0c?U4qJ zN_AQ9&y$Td4`&>y_|V_noz*r(g=wjOMh7H+ei6TJxK}jy=6@m={^%|N5Lvc81Y5ac zx6H31fQAkE!p6LVijDdy=x$1+{4Lyzg(5V0xyyfw9TCOMX zZp=jpP$7Z}ne8R!Kxp^9qE{eQ`?+9rgHAKk_z$Oo+t;=ilR*M^JbW^JEqdb}ILQ{JVcGK&a%lVv>#qT>0G;lLJSSf!u8EHoKwNmY?$eo8WW-{ zF;PA^CKJ)-sXCariP~sny30JQ8dG%hH-VqTeoWWET=vo{7VFgcb(6a06gq@PX<(zf zk8DSHD4`^o({VE;$4}smUBWTUh2#1qdr5!3Lw#j)y6jR zp%~mqEEz0TI}nkfzv=v6+;S)2QfF82kR!jWSl?b?#D>P!3*xG62YP?sh7bkSljC}# zz4#Cj9s??5K(|Df+|hZt5SFO<;ru38?eN3v8^4-tIWUqZcgL_$DUT_Of+cvDuh#jV zGNCeyFdeL!i_6aL!WKqcjiKdT6OLEMx%W$|^M+lM5u-yeS!u5!g=Jo^e#i_cd0Uwj zdu8O#55~rBQc|P@5J*^0Y(P_~aXszw(miM&(3ou&SM8PvP1K$p zEiK)~&zT2pP$p5P*F(mPl?V8dd7I?AB-L2}-X^W#Q#!Z?Geu-=Z%{27&_}VqxWM6@ zzQW!Sicp=Pi*m79vVIbd5FSh89&gOav+LpvaX^A{7Wm;`SV+M)Qf61&o04C=Js!Is73c4xkK|e%PVydS{=J!EiVjHr}wnanQ@BhG{ 
z!`$vZ5GnlSkn-6i&Qy0*s=~6)FO9cN% zet`75K=*+GiHD`q%{N;t>?Xbk{rO;5VBKgm9mQQ@)3Uq)O~=)6LX^V)!tQ8c_KFsT zH2F=LCW$O=;LQDYBxSvTfsL68rsH_3>Z zYW?q>^{nw*Y}E8&Y9-{)6vjvx6o3Z-2$jnOxp!aY=kMt>m`Y1avGFbZ z6o61>Zj`UpY!n0f0Pk~pV#fBLZ|FV*z!ghydrAu8iRR!MN*x4F#?AN52U4GeqOO{9fG+A4cS92|+zxNo<)UeDVKj7<nb^qZOdm zvuFBfT~F}GGTQ8}`lR$kqRKOc%5^h_GPv28Pv+sOAVWvQ-x`pLX9k{!_ZYnD(q_lX z2eEJsylTqcZOOxU(QSUv*(z)ABvt$@*UbXYY)f+yi48JTH*oK*$gu}&=;myp)?*VAnsS{k{BT0nFRbib+GLW+V@ED5gph=Cm_D$@Z zIlAkY(-@n}PEgv4pAuc&{xs0?gAG6UiURxzj1~^w=hLjUy=*M7Vejoh0g)qa4<=&l zdZ1hP_~B(s-269U>XjT+hwEm!wvUU6DS&@EKxAq-kzS9k$s!TBq|Tpv$woy#Pq<=b z1w%XVNV!%t*ra%UP5~I*dn0LdDeyAD@wqgnRVi1zI&Di^_Nf^?#9=*Dt<+4jG@^8I7ep!K3sFNs$Zy3P#|MM(88-g1c7y#agYPkUtVJ<>!3-dG~jGcjE zBK_$+j@IixG=iiWG=DhJ{(LoLQM}>E`Hbv{X(KrkeQk2=@HWA_5gHnr*wWoE*D74Y zZj0hjFw9Cp4wQ%;ah<#kPslOIRrdA(VACUN|82j7vgHYo^YGnc+SmQhjt=OYS5T@k zk%^r!?Akn7aw$`*mI~#9u$*Pk>97yF_XB{sQP`{^(*mtc?=<1Hfl`HmeP3U0Njzp7yHmn3TofphygYSIO}?9S`{!>ta2 z@7B;lYEL(buy4##52ZuaPET-DF7|d~Bcj6x382T(X-k20Ne`a$-rnAR19_u=JoA{f zme4_6AAYgP!TD_dhMnQ7%^ym_;(OpgvktKqi<+K^i6-}~lRkC?Ef-|_*N;1)uXj$) zEwqQj3FXtx2`oO@UQI)}@Y?_-r@wb#t$r(!zQXlU-RYA8XvM~-c+l}lVJX@whwlC; z!r-Vw$D=V;K6w)T%dVc0)>b|7Yk<%5*S`MGS&KdPE{tQHVEXHO2ShHv`jx+=0Fvx^ zq23l*`+y;2Y15JGiSuHWyUpm~juQ8?Z9~6#`N~s9(Mav<*|0Bj;V+HVju-FA$B<4v zY5m@9#>$t1g_5JW?@Pt9FCF(KW(R+KtG3{?*4j2ThTf^4!?8IRY8@Hu-3C#Nzrx`K zqCRMcyp?D*8;tFg?ukYvYnK?XR+x6{B&C(AG60iNsa!&So5JmEerB_&W>VQ_M?zuBmIp`GTl`-;Q$=#my2TBC_(u|n zsr^v`me?CA@NU6ZhVlcFVn2yrsta}`l-NL6_#S$G7B%Sp+H%A zE2ll9$Ee~aJe~d?cg~t?zR0x2r>!7(yY0!xYg~rL}V=4&-ZkPFJdmt&`1TvN5X6L6=jWd^@q7SsH#gBELUXmDq zLdD)K3Um%@7=v64LEJnbfQI!=YlG4+& z5p|vcx|Db?m|3zSA_ew6iA7^%^dtRqY2&2G-&IRk&7ZB}4Ulf;D)d;I>`Rmf9gK&S z2U3)1d3fOJ&Jd-AzQ-V?;K zGWGUeU>PtY`uInmtmULL+i8f-PxNZ7G--f!eIo;DOfbqS1+)ftS3HIeCH`2}`y;6A z*v_aj?BHYa9tAST(eEd`BTJ7@lmspP)Nd=E4W}y|<bp?xP z-^-YToZrD|96TH@;h5m?Y$P#(mE3X$q1!Gkbg?B*l(GLL9mh zvJw)y60~A}eA<1=;o=Gzsy^=@iZ3qVdY9q2`eZ%VvZf#C=l6EDFy2-Jh|jMLJMy2E zu`&~NBFW3kJ83k@NJ%|Sr;vs}>P_%>Ch=v)TGLQb$uojCUl=EpNvoAhVU%ZZ7-M-3 
zOF_i%M9MhzBOniAU?c*Z;UXe%wh=!Y>k^u_ipb1-Uh~I0{M6Lc&1xG)ToHlGmb=P> zIIb_bq>_oXg312M9Pb9LPp0E%HBK9Uu^uep*C(^u8{{?St`zEI=Z%9?V}gNgM;uMlt_LbYTwNA z$n2=M_w|id{n6Lm!|AZuY4(%I`=C#*7*<%noxm=G{X(h^HeN7`y>^P^&KA%IF@+__ z_YD(xQmQ|dX|+w|=o)5bTGpHIxK%oy0WeMn9F&2#-?`wUI=NgS$Cn)`_6!bpKX_)h z3q_McXmawM2+KR^WS(sTRC57G3Q9^P9N0&=Y`$PQKhRlZn zrx;&#C0n*emAW1%IUd*F|Pk} z86IPog14n^;~-3znTZL45kln5v@b*;`y3^i+ubCP3js*@N=r&q%hd}$uRMl^K3fb2 z-8p@7@p)>s6cwe=pn3TWi3tkH2Drm1LjRIE+QhKI<3MM$b3*qSGzZxL9`~$puYW zg1HW{hFOp1#a4O+<_{lEx_6dflWdA4p~?i6oviu^Pzjm|YiTsDg$jGReW*+6hvEOQ zP<-a{XMT!w=W)kE!7(t?pP$e9oT6CZ>pk{~Macch;f$~q#X7lr8)jmxN(JyZ9wnd!Kh@j3r1HLwQjZ9H z2Z{gfzW3KpM@(ndjled0z|YA)-#ZUR5|@%XTfMU##0ngX99{Ly)T(j|NXG*Eq+D*% zgc__Zi?5Man%9StH&Oz)?>$_q=)S7W68y(HQlJ9qBg@DA-F+78EWEDS>d zIM!*_+I{7-@fd{$;T*c&lHwsO9s67`<9ONf5arlDMFoL9{q=QM}+m#40IOh0jCa)v)OBQ2w{u}OUg^a1oKo05Kk3N`r`j}-v znSmCQ=PM$9C28J1NXo)u{A%+-XW4<&Yv)^JG#Lc^AC5?!m8kcJCv$bx@M^@AUu z3;~T$yPsc5dL9uUrVQYqa9(^RNE%RH7{+ntKI4@edoY?kF-sW_(;lTuRN@02(CJSeCYt>*e_ zs#U5kEw_TRhXz1K;w=rclE1&ymce7cIvz|bH}AUx3(5~#W8eN!rQ*7G+}0oPA5@jU`%IrpkOs@-hu#8-!o%f_ zp}d{ICoeBilfwG?$e<>2o&DMJsm{gk@z3sLp5_BzSzCI#g-$&iJ=?r@2-k7(@kNqa z`lhF;gYgoGiZz==0an+(Y%06UV*CT?R~h{TG?PxxpuyW%>7ry7pV@unI9`HP_s8Q6 z_OtN`g%~Hw@=9BMQ?>ebA;7xl$bs4EXtF_uR?G&XybQ@c@vl!WAnZ4U{L65k$y^$L&}#jBdWCx)(vfSUF<|Qa05%Sh-d<|Fp(WF3f z=4i&cxTtg-H`%zxJZ|>d`x$U$+VafeYlDSBkp31m{7kDPrL;6X-RgAUuzgY@-_Epm zjw~adqhu=ANKQ`aa8*iWs#8=@tFv&k-3J5P~Q{a2mVjkAo}vo*p-=)4c#E6h0Tdnt4Y!M z(ZN+Sb#&lYzGHvpHFQ=Ay}cuZKLnfQWGdf-r2_Q($L*zT%s=+{U4j(e+BpKw;)YKOf}pU6ec)H0NRc+vuoWF)cP5!+ypFkgcg zOAB$>H;v0~C8$?1@>AA8uaL*Pj6y10~q4X@Pyuf_j6 z`u^($QP{g~IHMT@K8w%yHN$fJe4VZ;W zpL@Mb^s|5D;$kBu1vdCfcP;<{sE5b@e&w(@m=*W4u&Bt9O;R%Sny?}zCg$R}uf>yq zfeMC%goKNnRYgr}cxb3qwC-{gXKr$4W|xlqJ(sfP;@f!oPddTG6kp@Q!cg>Y@~Wze z^788x5>isbBhPObB?$>{Z+fmJB_*a~YYvXSXm}_U1qBN2Rwc6^@GVp;JK^KW^QQze zJAgwty2rrRm4iGeF=%1&{p(i`m-@syOV#g6rLln1VOdp&lw8Mg8zh6CNBiCF12R5p zT4tiG82#tZnpFWhTM(>tQpqrD>RcK8q<}iHIcnV0o{i7zp;Sj}U4nZa1gmC)9T7)5 
zBly3)+WxT2dgq^xOmxTT*~h&fI;JZ03dFj}$$Sj~DrgEskZH@M%uT0S@wa71|d zKffbD)NS{EyAjhD_wve(jg76SNOYFCvQQajwJR>H1bh-WmZWUR9q8?G(9x@KoA1B$ zF1cp%d0*V7HFQud2edL8AD11@lzbxOq-yk!1#Mz7vx(s&y(-cQWSQ>sNE|5YmF;WJ zA4ulW**-%VMMc4}zn|YJ8ef|%kqN&kRiizduds&>+sxJFd}F4k->B#hduwF@iHTv{ zKA4=QGO;l+(EHi!v+})ra9D$6uBH76JX-GTZia!lpozl?EmYJMmw~;;&5Q_nXmg;JxX| z4;?(J;Qv`k5F{!baFn>%@pMPS@lOqAJsB-ksp*4(kFEgo4)$sj9`8F_v08b%P#sLS zO+e+^`F!mQR$I-U?zxmrwuSLijF9K`(Uh(X>rwwE<#_Wg?sE-C&#jDO1|-)+I?LM_ zZ_%tZm#|uFtW@+@-CS-?wlq#_vxn+>MR>qsj8?6@W*Vs7#Nm4ut>z%duL`U_aLUXy z^YZyT(p#jarQLfqie9!Lu6bO)q^aG0?wYiAT76$5R%g5P4R;I%#Ge?q0guz^8byb! z#zVF6=qhtXQx0d2Uke%M7_Nzy-|gCBr7%#fVogdyB7uupCu^=;$NhPqPlt=uwC7Vk zPMpJQY&q@R@rDOsBVjh~T5oTDP9A1D$H~xv#Qcw)A2B(ry;Uz%>0d8Nt97CU|8b4~ z9lNSU)_;}<6B8SpetW$69!p576(^1}<$3Yyjh|G$s8iLd;~isKqRR^Fh$g(Q25)I2U*S)=%(eM7%;^Ti7_D!)suXM5pcP7jDDT*A$9L3jJ96R(S?R~uq?5||A;q*E< zT-n}6t3oIg^hXtfbj{h4{|Y7g;wcQ%3Aot~&1O($c6d=*LIO7imx~#f^JXVa_bZO& z^qGu3UG9Q(81AEYyWnN8-0I5H>&r$Sd&^LF!BVL*jZlF$wNK!T(4Wqee{H+Jy=33{ z2MDd3!mI|=P#%N-bvtDLLT+?^g7&--e*z>|689^vF~=3#ESeZ}lRA7GJ)uL@WntoYc>S{Mto>OIk=5cRH6}~-VXddX(gQJlwRlH= z)w-fnM`5vJHPGZ7xCae;y7MsTmk4kvUy%JeA3j6;D>3@>xY})8;uCzyTv0F*$-%EqHoB2V7 zTJAC?8+Y`-gHcod^n>tu+!R7a-YMnI;B-}T$b6oT?kE<0(H+af7zkm^ev$7Syil66 zU_WFw*;J%Cjt<0sganT(lKrlBIFlw9&qoUBXaM!!`|h8YA@F(|@pl@9Xj%r;FMq@( zk#wL8<;t&l zasC#^?!U8%e;$87@@Usg^6&4mi%Lf;$i-ykBJUm&l?q>vI`SC)I zIUjKcIdBaupv<9Tq0Nq&mp!vzq}xXZRj&H<^wZ~-!7G}|D=3_<m+AlVsbs@)aiJ5$Nxs;FC=W@ z)cGo8GGR+;Io;WgZ|p3H`2X5pe=ImKO}n&R17Z97R*(mW;&I>Y;}*$-2si|MHeU>` zI%C3m#2BQL8gr(9QK{x>KEx)P3=K(kb#096w!^}P8O`N#cW}b8i}m+@(BjCP8IA!u zT}#{%(D3MKVp>{p)tl7RG(fCYb}HLGe=k*EZ=qYMnNS_8)0Df=e1%1B7{`ikJzk>Z zSf5nEspW$8dNJjyDp4H}5a5k{bZKnu>{Lsof@`dO<#89}+W&mqw^V4IZmaXsKQjG| zokuQBF0;*i$z57n8qXFT%v&e8S*5IeL`7M-yPH(c!zY`!h7BmdlyhEtiIWhlajQHy zFZUSQl`91z{f*|o;u$*q>1XL^YhXvsrRqM;_@qG8+zVD6%S}UBC7uDYl506qKyi zIl^kR{lk&@-|OMmX-nz%F>vh_RUhon040wG<<^GoIbL2-p_-F!%}01#(SL<(hAE-}^ODDOR8%94+*tgPE%;wUlnVhwJfMg-%^9 
ziztWd*;1)a-=)j7K-Sk^Vb7QhYn2O)Z`&!j$8b)#ysi_wQbp`FLyGl@&>K-AsbOlT zx5xYmM+MRke1w(uGAQv0#;;Ew_~<`xRO6>t0Uv5bGl5#x@6?`p4u7WXY0JUh`>cKia!XrJoYT_-R$DXNtf z6~C+G1sFpsX}&b=d4?)8GP0RXefN)c*9ZH5okpTCzqzIH){4}b;VYJhUu2YiM&W{2 zh0Z2&xU-!(sRJ9Nix9lMMar1KRRjYavCfv5oGiTnf4a7)H>Mjs{&{(OCmnNmY}TQB z-Y0{LWk-=y9;*i!&|-dA8@p=qemt(ZTjz8TvOkLb_*r_biqtcNg|z=ePMdujklZac z`(Vr&$#qe4ZUDDHr5@2-ZGCmI=0YZ(4k5pF$#5K^mpIy9DVP zx=Xsd^LxDaHSYbq|FvAR9@_ckK6~$T&ip-{^TFJ@Xf<#2e$C!zGVFpKISIY7=N{Anvko+WWVJ&os#bJ1Jt38r6qL#uy-F8%!zJs z^=p;(tdo(6%|65r+?t&D)Yb8%eL}>oOC3sq@{gI$W!Pv+?`?1NWu#U)6}J$YTT!@R9RVCYe>c9)x`L1U+V$x=5&C8#TalaN0y0h4*LW*$ zp}9O2sO4PQ@?;FIY0d}uYzJ_2IE_b?OX$c-GQSd!sEle4WEebd;Ip2B6vB{%W z#?xO%KVQaDcnxpF50+3aoa1o0dL>ON@iFw)!-Kk3Y#nG|W329ha;p1M?@9sm;X<$w z57Qvj@2~g&zn34ZR{@aOTkM1K5Z0{Mw69(e*EGkiPS}GgJ1R^J7$9>eJn^ zyG6d}_#vL9B`3WN!Z-q9q{*y)&0dq0UMak>I&1sKj2)f?!_`LDx#)4yw zT3Y;#?K-XNfKaBi%iO4y9&+?4v&SdmDb_M25+lJkkF(juLzx!)Izl!o-J(Cm0j!$M zku5e2!PlvkFI?ad;NXRN*b}s0JQ&oraS@=Dnv=fQju$U$|gPlyh#vGP>5CBFb41vYtXu3@@dKB|?p<#Yz*qD)NuTYhvMUR5m zTqfiD`=!n-w8((2_QH6w$o%#@2`Gq@Suf&sajGmn_4huc>Lk?owfg>in0oo_IrZ65 zV#iv!(dPIr<%xaj91A8b5KCa@N7CfyV{^HGU~B9(a#1Osp*$FJ-A%yEmc;1_i;anm zjRl^*jx=8%gl2cQueH{uo12SwrA7!ck6UneO%KZ0J3_4i)f0HsJ6On@&?w(yVK06D5+_Ac{2`ywJ$TDVo+0GdB0Gw zeliG1d%v*ge?B~f0DKDpDVgSy==;Mz>U#*EcsD}1tNKpas6z0u}1ZRLZp+1dp)E{^#&l4IeQ{r z`*=3f=^A!SZzDGA75hT`BVCrT)P|IZ2r8qlVuvA=t8~iZDJS4g7FWVk4SnIXHu~AD zczqkyvVpf84{RMLO@DN0mWBIhKXF$J2^3uRy)X)GkCb?}#v>)UJFMS999*Q37eZbAH43^MNtff+z^2L(cHUxc-Ue^SIt%9i> z9iB-;$kBzIn`$CJt*JS{&!v^8mysX48`#-Z^72e0rG5rtgUsCm)oa=!@1qqU^715j z_$Sl+>)t1JK)TU{$~!xAl9y=9jMlrTIV%qUr>78teb&2=1t`$K7JDn^)rR*o;t^1< zqm5(DgY{&iT&wg1!Lk7L2G>NcM@TD~xTDRa`H_sk(Q~prmQrbDi#a*L` zxM>||WFA|=ipgsZEh+@-AyYh;R}d5yt~EF=;IZ53`#THtF9v(@8UH7NmP`5)^nbtz zRRR2lF_-0bZZ4mV&tcusm&=18TKjjM_My26p!{xm`>n~O%EQygN2ibV+2mK>jnTdx zlc13bd7^4yElxAjCmc^WV)`DV*Sa+zyuAljR`{A7IO{+`Z12Z2s)~wK$-S;oxxqf_ zah3~iJpkD%qF1y<(f z^rc(yI~JKe+*;~_y2F4VAy023jEkLmu+z?EpSRO=m;K@;MThPk4@pWUO>EC#Dq=l@ 
z{!gg(8+!a4uQ%{z=di`k2%e}t{Q1Q$4%jG%FQui2msOl|@GjTzCB0+$YJTR(k=tAc z`?6Y%mt-Eq!py2_;I8WL&SPw&uj73~PM%c72V46c*$WyTyPkTnr7yyZb9I`O#Za$m z&P~UH%O@;(%lg&hv>LGzo8|q6r!>B}*2?chOlX2{+Vsudr^o4oxCO=OBuN@fr&kHpg8 z7SdyBZRo$4=F$Wer%mvY^R-`(a!K#%*K$RY>lAV??x|3L|BEH#uU#~I$n{H~q{@JI z%x&1f_^cCoX^*9)y{?R zTT6Nr_Vx@*cENMMH`pzg3Q?vuvUIuafU^J*!JHDkUfw|-8Y*qz%LCK||4{I!LZ>FD z>+&-%fzN;{Ktpr$?nQxAca*ndnGmthe}#};F+QuURSQg{!YLH8;CgQQ(VV0q8zMMNpH2r$rt5g3PH--G^!@mjzz%ue` zefvNM!S7JzTgpGXAWi%ED-ISpWqENudGE`I@T)Uea6aE0P2&?24_LwMn`Pq_HA4vf7j;vbYdyN zmDFh|jD|jur=}sd(qWG@*N!JZA2~c=C3gR$6li9f^S3H z###y0wfuFrK7@D%1zA(8N!Yt+HD4YhN0v9%(o&L<9a8iO41f{@)Fgb8DVYw94B(WZ z-3_gKNq6VSfUG0P{?#Wdy^vfV11h2>L07G%`|>_uaq3>2g@WNu#lIIX5x6SzB{HzP z6z<42er?-7A!CG4TEQEQ9+_0PmC1FBIN#_ueRCuq=;^Jh(L!ljQ^>^}#ETpuXh*~4 z7_^%10*g?Z1v*~ZYBZDpBQai~lm8{KVSIpn#6q>m_pA!>9|Ym&!ynsowj&Z96~*Uz zGhpH_oHfPF!eY=Lk&w`{Ew4Nv)<{84Uh09}4UVi<_cEcOdNeH^^qSScfL!2PTPQ|* zph4GNXj|uMMr=}2RJ0B_R7~vq{o#QKU<2sE_ow*lRBFSIF<%HJY5aQQ&#D_g{Y-uz&c!zl=b)H2%&> zV*mIjGKxTj2Leayr4u`QlJEdV+8D-m6KgulS!ey-4md!e;V_Z)Y3jk0pryHcf+VZ3 zy1t(+!AnyTF}E5O)katbScdc+`~~%)J`^kO6pw7X#dIGM9kKEx2EEyM3m0Hg#|fft z{Yv;oqJbE0=6~G09*<wsHhlIc~F$hfP~`n7y{Buw&Lv%w?>bbP>V;jaa-g6{=ji-Ykx`)_(RHXJF;MpAW#Ps2a*dUPlACJ1QBtdEb)VbV?p z8jg&QFG}G*$1Tjw9Z)cwdJ_Io*WA{wV#@G&^t~U}#j)h`NG3jfZ}KZS)fp|16X-xk zG#lGx^HD1(ebem$^oi8p?kM0ONpl+EH2VZY@cjxUyakT){tEo%i(YE8Yg|IY*=qiN z%B_)cDU#4A(-j)^DL>*R`9SjISFVQ% zzjo;V|G*8)#{zTAIrv8;_8q=wdM2i{Tz6Mvj!iy!8j zV2U*yo^p=NMLIGu7Qe@Y&jF|1PDf<46 z2_|r#MweVeJ*N6%v*-V1t^MB3WJAJS^SrwaX3b5_>aBdY_tvE=XB;n4&J$*zUT`i z`TqwV_{>9JPk@Zhi`|{-m$mXiC29mq4XkLmKo1ZAb5qjTWFVJV4ugQUER`won=@zH zH8Q5biJ=kNGCO8~*$fKKUhncU+7#8MDu#cPa9XZbT`CK#eo=vk- ztRT$Y;8o{n@p4=h;wn0e3uG#wfnB;}+{VOpnMgpy6rc?G_F*RnUz*sR?%KA2R5+-- zq!~A(6cQK7|A7RZ;h!1Gp13O{J^E*a+)=#%5dz4f(VuvV5}d?oN}u0d)bRc?EN$_X zz2X!zJALAW9vc~lv1;&`daXMjFG=K>K6NUmi%(^)Y7^@mXL-U)isO$4`uZun9xQ4` z+%7{r^z_LD!)cmqRq@-_BOR4fAOKAQ^f(6qp}9?N(CC-z#tmjz&6skaX}ACh+hhIc z$?xg%^oVddhjsAY=0w$0HK>|zJq`{rgy))gSslLa_DJEzmAvpzpo(Gp_; 
zCxohnM||_zr9Vm`*A#xl*`&`SkM&6A?r7|Mc+91sOe0YwD9Fjhp!=B4?QY5DZrI!& z^=K7Nl)>LmRIa2Y&eDC)A%*a@D-#2+z}>U|2}C^kbtogMdFxxH5wHK*xiHVVu)K!t zm80S+^(g3P4x!y<{X!iUd*Rdz-1}iw_Wj~K_0^F{dEn*gapkoF_wFDK*yx<0-Nltn z-T2*@7!1U--L3&hYFVCR5`sBnxH1uQ8VlDkuwa15rORZcYsMkfb@w!o4QDXhr>qPu zzuT@>_aT=i*|(PTxIL$aXElQ%wfOS#^5A9nMuI7dU+cn4oh0JZrvuVKtqW`%oM&ig z80h%PqC>H@)>H{(^z@+)RP)V>Hl`$fKP3wfX0J%0dX+rY1oCINm?L@L9ziFFR1F}^ zxAnRMtx{a~p!V`4PNx!ME7Fvx1fi>7kl66Z2#_R3a?rd@9i;=PD?3TCw2)kIBW!-R z49J69BIfHC_r|=P+DEOX#n1u~h-CACZ?g)T{&j=Y$O|Si_rW~DMFCdg0VSs-az-9u{ z@4hKb4>-q&S|(Ip8`1R!YX{fw92{dj7&PrvH{B|B~pz?`jsIo)i{nb?#-v{7&-lN%4c1au1e zAFW%gJkw+QP10EVef4f1(XiC)X;H+(OCYBchg25J)>Xi%N$*|V;yNbvOe|m7XD^2OXXK7*Rn8pw-Sl8lHJiW-F=|oEaCZgZ z2zXPSU8X$ca^Tu&u^ioIO4pad+t)X$Iv$}uT5YlURRRP11=@VhScGX2p|$eVA(Yx# z^DB#LrGC_xFEJ6ADY(Bu=cNcSJ^dJTq4PFudm2T8AQ|r*cX8wJK)R_g%EFAk=lD7Ojz&8?A#Y zZJ9xps{Q`-0oPp6AV}kEIkqK!70$_OJ`q*6gUBbc04fhJT(4@-Xfv9cInPo0tiSZcoe@mdHo6O;BjAi;U| z7|1rj#OKWUU7Uk?DA+C z9*!v*HjbVh&L7|7ahdO*pZCw}R)}iQNL*J4<0(2c|ioHSQYR6Y$r z-J8y^qBp0P8rrH@$FC+dpLZW!Ch>s&aE5*KYnUN3{B=}Z*ugULq&_7WDwkN-JT@$r zRiHiFM%P978AN%t3Z{7wiT0ofw8J~%XN_VizOp%DjjOYxnJCa~8SfC3Q3+6|TJgME z+r7WoOr(T1bUC%+3rm@s_XlUQPpPWCwYS1MQmCa%p_)^ z;?!ALS-h?{&H(2>+N(AIa5sPqG#n5Ob_)`#i%WOK}P6g%=v{lX!OWIZ^?2nIA6*ZfIyIC|m|m@Qg?Sw$Rxw?*(6p zxR_Yv$Vw%DBq~PRaNf7BuW7RO^fd4Ji5>=FFj{0}ewNN%)8DNH;%9ti8KX|YUyFyT z1Rh9r$HKuVaXThaQYzhPeQ5t}66Yt^0;I!yVl8U+YTDZflu(eDcS6s~$ciHJS?^h# zDv=VWwAyP!po#iQli5S+Z-0*ZNVC)sbwVON-kSj*#4d@JKKoc`6xj_UeUiP<>|x?W=r4b25>T%bA=+K1h1@hcXe^Q z?NPX`__46CC@U!;g?|hRiqiDa?N}Jl6Fpn%So7h(WVSa;;Beg$Y;C0%U1bK`cOt0? 
z33*^EQUlD)k`gMg9g8wu{3Ku+u%P=zQUX2&VkYWH8vHlHq99Z}4Q-CVpDNeCIaPib z$_Nk3R^-=@%us(~7(o9RRJ)`+4?Ec#I#bEYd}CEzd{0W&!CxU>-cf0(Ic;oUOi@dv zA$Wa(;C^}*ho=V!K~f~hw#js$SNCku9jDRdIR}qWK+zvG67aU-pk@#yYVV?9Y6Wd= z*-xW5KJJY(TFoV)%&^wwtKqU$A^ijsJW8~2vFc$KWpHpp)MP^ADZrlNb}{3b`ydhK z_bZkq4u7<hi7PSBU)(^7X4b0nhB2N}lYM ziv8qN=_X5fx*%YY%%@A?vrq4}Wixh%%lj&3K~B`np9}aL+uW*eFCnp7Pti?jA76^G z&OaWwB{9M*$_J7yRx(LAh_k=@R-D)Z@l^4`O-2V!PJX`m+$)#E2BsvpCM38k&J^wj z5fcz>-r2$2d1qX)yS&~L#h6%*!eo$>`n~H64c0$o1I)8DU$DHJ_&s$nJ}w@RXh=ib z-CJB|IJd;*VChZvb4C6$Jp+%rt|#mV4BM!FZ`xcZ`Bnx~4o!OIwC7^$<}v=sFR8OTQ^D2p0TF-qCTXb^eEt@$_v^e;E(|~zT~;B28@9f9-hS$^(rbwDQ-t6Fs3q% z$Jn=St4nYwb|}iIzaa9zLwNolg8!q>@c2LH(JKl4{i5vdw4XVUOaqv~op0BC;!^gb zquS>~OBJ@aw`GN4#X&OFZi-n+yJ!p}w|gRSxC)d{r)$?w;*UA_+|D97At`wdo1?B9 zX$(M~r#Rapb?B3mt?CR(m?~P`&bETzuS;#@1DaWVP2-meyN4q2D?g7n-g^qN>Hh#s;azzZh$XUygq^ z*CQtv5H1D&L{j8oIYMI<-+5E#%Uuax0Z^>gT z1!`K_u_AEbOR=0YZ;0VIp?)t~)()dE4Cbuj&$KUq-3aq^X0pRK<+^%Tz4K}o@XsH@ zH8y*EhooOGq9m#iPCKNG@c-ibFN7XZ(Nd$}&yG2zjU>B32Q(Y(w$IRyj=P+&HG9U( z>>dZq^URDFDuFsb=J-M2a*(;4zKzcdP3P)*!ET?Z8wOs0d5ME`9eTO zCSWe5D9m;skEW*pSBk1FS9N&!yGd86pv)IRp30we{Uyq8E72 z(a}Zy)?$@nHm~|;``Vf+Vhg2^NI2{TtKRIl=N@pn0V2F@{pL0{5a`E2Akx)75Uh4u zB|42^Kmg=Uk^&5e8_Nk(Xb5N9Z?&rYD~y#=d8bP}_&?Z67JTvk;^kEopZi=xul)GM ze7(2%T~NW7FOty1&2$_e03!e?9H=P48N`S^9ik3j48+xj=w0ktj^`RnRk|Sd7V(U2 zlEWSRS`IzFUMIqxZH?~7oAomM`igYBsb|LeIiitdCqlhJ<`GGutbik9h_dpr_|IYR zx*^dv*DyIhOHZz23l0umRd&5wI-Dp@6_2I)W=#CcR(=D5@4_9furj~e=vskEdkd%4 z{1u?XFMKmjPHdeOEYgY$k=JKM*~l`ojqUB} zcWR1-X2tpW$%U0`P(oo~C_Z%5mu-UN6qJ%#tBBkx);)<@fRyz&xsFys))^mcqWI^_ zuPv^@{l#7I9vurp8^Iqxx}UBsJ1l(N*if~;H8#mp`xLK8r4{7OU$%Ol59sg_yOEue zALB>Uhl<7v6U2%9a2wq2#}@crvorA0^Jpk&B!7tHC`f9gUoDB?RL(nEU)x1IG!7Rh5eO`=RpB;^k6hk&a3GL#LE-RXlf~>bw?Msoj-_v&wQzw z7dd|$tSe}J``U?vL|A<=&f!?k@b@&64c6*G(>u?b-H*N1Mms02>!B*9`SM8!u>X1C zdOm~Io%-PS#Pi=N$p5~D|MPW)+G`-$`CttQA_!!B{+Gb^!|DAY>VVGe;t@28jPm`j zAos@u@nb+#?1Ft(_nRO3gXjgm<0zir3Y~?N6y1L4 z%<+59HHeyx&5y(Uj-&hY8;i{QBuc04AIK*l-c7NVVfX2 zpoq;oR@2^4aY@MaJEon>d3KYR&IP4gVH 
zMS}uBg_mb&cx0S@RgkdEOr>^YC44rs`5XZjOw6$zX^At-g+_au=BuUEk^O~cUw^-0 zspICE@uBjvvRwIEk?4-Xtx;=PMMX4b7LAj-`G!7l0v1QHW?@y%rv3I-3u!%%7EgAs zBha4M+JF{F!JW>vVRUP3xP@C}hx@A=nb_k|gj!&AM42@^x5P=+-DzSRD&d^d@~YTC z&|*ZIudm~fFAWV10)hyHyr3txP_bDPs4Xq^_L~~5L!$fL!v1{y>gCo{T=B{>^2b0D zqKRf0gq7vtkueSyTDdIN%bFJnZ01r;F?4Yh3z63ZDgz-;<$shA61%IAP=WHg+V1zp z0gYw8P+kL>{wobH1EPfHI?UA^SXo)uLdwY_wDzWgJFE9+(|jB~?$IzY#j?{#aW?R$ zr(=qg3zdOV5HxS$10I3Bb@}$3YNC&KLnkD(n)g>&)8p^wst%iZtAXZ}RVG6~tS;T9 zsE>Z|ZGUWgJ;@5F#3G>@lVK86iwEvn#hl!cBNV!sZhCSJMX6kBW$oEWK1BN~95R=@_bGp9!#^K&0KHG4|<+W9~0Eb4$%Qv>` zJ@|syUXgOoPa0kB*KXRQX=qqDEB2Y)`&gaB2g+|wDH>B2$}M~Q)h041g+gKC1N*=8j@ndl}xQG5v{&c$Nm*F{-maUfGw|9X=2WIeT8p2 zOE30I9U<#M;o)4aW~i6XU}0}E@UpYA5D*cE&v!1`?&@eYmY?bKM?N@dbUWYLxm*T%B`3dd3Bl)yRUwWc5VU^t24@?zIi49# z)knvSGtJ*kahJjx2MgzDSe6j;*1JLE`srSp(4pDgoW1M=H$qek%gKr_ znsXxP9sN1-xtvhRvdf#x^v&{4tE<&u4xsC%qvJ-zcb?Y{FHVP3sBsb9R?4SqETF|J zr;Fp>f2^^G(fn8(HVOw!`zmg1Zl=)F=IvJ;gT7#Moza+L!kltdM-0ePF-*0N;4g2?Hh z(<}Qn84@!z6sbq?1dl~0O+UnIOa>2E77-;v0y7w;G`)S*Fs|aeQi%ZP7oF%r_9sv>5u5WBM=}vCh`= z@N|aqM>5^m%+7H7Lj#Y95T=M1f&t`2qg$21IfI${j50!%k2&ql0e^;kHx671Ia~Ms zM79T_b#h>kjW}M{&|t1;I8j#zPDzL2Ex!ySz5w=)YS-^;N|x#oU__O@k(~UmVEgAq zbt3>TopclLm=ac)pFdF}_xN%hs75Um-ejKp(aV)3w^32rZO2zY(OJ{P(mpok+~S~Q z&$c{qwcfQmezW`fDzulnGf_W|PD)+9;P_(-#uwsV&uke9iDVX!qh-o-C+BWM*#p$N z;U?G90{yrbKz_zK;o@7p)k)b&{_}Zri(0K{J<=&*7Ki<@1o!vd44{O#6q)K>b;@3T zppHa+RkFdU;9qrgpmxz9>8V0CYWa-CctvPd7FK>_%qGn3r$acQZ-Qj?WuvlB{p6Mr z23=0*=>2eH?O?^x1fUy!ILBr=S*tl#w|)(q>j(hcZgp}*{f?N81_N*_nSB_uPs)5} z94t<77eCRDZirRkI3sa1I^t(NLuT`9 zf*bHC+NY$BBG)yfHe~KL5{TddcF}V;e2M@)(%iW@$_;r&NAQK?cLtmkVD|r;(PRrR+5zqVC$?bSXWyk?4*m zXEWa1R8$s}ZuDO3pWb-CE&kN^7fJRfl1z(0n`|&gxdz;|a>gqqfk)7zl}#UE38SMI zkKQFrhvKp4tA5UlkN5MhGjtZ=9E&DX>0Zu0UvP9POL>+syIr^r+_(Wx@aR20)47n6 zl8S`mKSM`ve@8=2J(jD~EON>TfsEf@F=5<&DL`Lwclv&LGPLYcu}{EJwTCZDCTF;ISv&R$R&0AOe5Cy@=U zj{)Q(gZ5Y)O}OoaMkn!59A zu7V7xapM;;_rU8VKVae%UKDM6{vUY#*N~=BJxhCYO({cU53c{{hNU&$)SBi+x=Lg@ 
z0#Q;-PpZt-=+zl=#m%W^*((h+qrG$+z9jn;`A7h!*vhU9fdU`?Fm+5PaSBX-oGR5B!QD3k*x9Nk~ zo^7n&e#vBUl$ivkiS7N@x8g1&Mul|s;})uDXn1`Ia`Qo#HY%|tRNXWn5FNMeHww<9 z=h8y0Zm144(_$b-)l9|s7vQA%@vM`KLzYA~^THgeGx*dnv04zXyK^Tvia+ypU^q}Z zRk^zx;~|s5)mzOFn1^N&xKHg&pg<-BwF8*|Jk%)-52>@6?KTj%C$<3uIvq?tEO5Y<*IbD?SC8s=7=wK%$lYhSVX_uEH=QSv{oYP_P<9rrtot z(&zb-!BGbZ3yb`GkCWQYJt5B|HbL~M=FVI*96n!((S4P>Tauma$)s?u^uyvo?%2f1 zy2}R!C3^j@r$$G0kNw>M_`Ew+9jegecJ6%tIe7pU4c?jwq0*GIz-G!=Tf0q)*`!TB zh$gb3Xzj^whUQOnf1?SD8y|mi-_Sb^M;@NW>@OCG1VDKh826OoR44A5=%Z#^xG!?_ z3Dt?OEc^>_-&_G;%j5jAV_KH*R%c*Y6-6ei`UvqX375Not5&wAXfcAnxVkv!9cgxV z5kKw=0JF4&nW#2XgIr@`^m+Hbm_!F-7m$Z{4$R?Cd98ra&KJIbp_a09s_yYisk14c-Di zlZg~5^BG$ja~0+3M@$6kU_6lP5>Nv;9`6xPwp6^#5QT|JdmBO;%ui1H0jzMCXP-wP zR&DeW|8I!&2g(ILc`2p683Y1pmWi+8;g9byXwFLBJ&B1;tv@D}powZS8*&B;Ma-HF z_;kJiHF1!UeH?K;#_C;wZazegndsjxq%`HW>v09XC&}Q{1ZbIO-2fOOv%ANoZS79& z_g`DkV2^&1^Qd32GU%OcK5!MhoXi-9pKMZ~Qx(+E&|4q>RtD%cF6{IcP1c)hUIQ96 zq-~v@8a1{FFHv5jU>2;;In7K@n$K9vN+=|jM`BR`1Cm3r`gOO_IbWi)9T72Y zHnC+?6l7eDgCK(St~~;6p62g0l~x#FXke$(WuBI|NAr0lp^aqGX{Y#0ED(^d8F2wk z_?Ec!p8$z%(c7E*VnE>lOXyh?-?~cclL3v_k&wIl+GFEs-S*KqI(15*%7t12@?+Vj zVL)>~>1cYWyjuW8FP3G5LV%aL5G};QWTF%%Hg^S*CI}<{?a7z)^c-(0BN138aHB4M z?G8{V(qg`VzQ5{z@u{@O{#~Kx1Fas{7UNmV`Z6nmjQuga#S13zC^>QS)ERw1-O2kF zVcrNKvtxGbBTj3UUhF!DT`nCuK2<+=>^vy32eI>St|{%&vy04k8p4Y=um6kw`U!zO z`2|S%vF%REZVv7qgFA2r2R4Hu=R6XX2_)W1Y(WvgG?qVW0Zf0?ch-@yTL-(@N@C3P z74^*^H8wbyEhsILh2mH0)92Q3$2EtJNyG+#!sCea_|rG?YSiMIc0S~(%NU=jhQ2*R zJveoA3>ZVjXC4^OVg&|Q!Mz6IG_g#BW3H&AAE=($-u9*(uZ|A*=>tAJzbTeDllSlM ziZ8J>lC+wfA*Map9ij0QxOxCR^2eL}b!R_-M`rX|qR4V_NKRmL3miQ`-l^?UXmP2P zHm1hFgy+f1ldr97O7XfdtK9@|txWc}wZ$ZBK_o$^+vDb4XGh$x%&=v$97Dh(*u8;F z&!!a60)X}(C3?3vZj%(0S~adTOiW&;!#9uD3n4^$#LR?ju4wHnAK-G|wF9tEJ=)g| z(VP_xgze^NbfRdOY@rFgCSzvY zvT@F)^DCf#Le7Pcg4b?jKnbOvX{mAU$Or zWlLbUpnvw1C#{;s_!}N2lFeF)kZP@1Iu?NFKOd!BUHNBNQtlz?(32YgzHn-4xnYq! 
ze-kt0ZUzB8CyN)k7UazJd(M#>A73~a?9!Iufth0owB25JbhPW?K$Wz$zm6E8PL}#0 zhQ6!@vQ$X{GgDWGSw?m?(9=O6jAud^#P)^{G282pW6%QwW|eQN;x*m>3iUq|KSRfN z+vwU;FnTJ3!=b0${N2`y)^z4uQm2*1;h|M~5ax0M7}?C*l{qV66Z|w%Z!k~TH+Z64k=Eu#010FkJ_ShE|1pbMSPxuB4xC)lf<;mD}wji_7Jfv7lfB8OH!W*=UYt!0^Cf z%-@eY40G58w^8C^p0kYI@;QDa&xVlk-Tv}?U1ckq1Y7KD#`;A;QcxZCOwdja#eH z;;t_O^6V4x8T6Wvm9VB2T{ke{QBxZ?L~Wxzwd@tLC-Sj7+PA!=k6jRA^^3j zt}gjTzb3=fFKlytpXL4g>QxE$M-ujF#N1=89#2@DiZ5AMtozF|GBcA4Qv=smy1OpG zx$<}1XJ>*u5{l-Y$u#sz;$+`u&Wm&3QkyE*EB~O2ZF;ZTfr{rn(rP_b6<-_hXQ__p zN_7w{8``+FlW=7`jRFzje<6;Xi{iMtA|7wdmbwtC=oL;A{c+&V9mK_5ttE^vUZ4@#CHy$`1ja#ZU%jMpQaIin_dd;a zrR*T;kyYb~>(#~~Pft5>6J+oe9E-x|6!I(3Bsp`2!S=<`DygYZ3mVoMq=2IHs1vFS zRBv*On9Fq(0jG|C;uoJK%0=F1#qL=zt{VVFVkb;ltgbj|Z1#YXcb@9VLWAb>? ztnfB*nBn5W@%sAurXHPdf1h`u$^90C0G(XFz%|hfo*~g+udUw9TJtI zTOyZp7$V_P#1Ba+gD(4xDx1jSkrYzTBfsRlm3s!$?#=y}hF$U5b2*sjFq>>AGz6P( zBj>qDDo%XQWzH-tEcUHyPt=c=Z5ZQ|t5_C-ooeHg%OV#4;Jpn5CCf000xUc zUQjV3d~okS)s>i@kC#m~Z?%RdPGm`5Z}to9=i+3;fFnxmW;#$3m5F$*4@Nha@9t2O z41=vQmZecG6s=Y^J}a3*2w&Hnw*{s*Ra@4|y<`VV8ez0$As2=i7l?#qjiTuhJ$d@H z$>lsulb7r$l@L9+Wcl@{Tr~e!Nn-Vvo)2Ij5qt8}U}(n+(729lGXMDSU!VK+5>fDQ zI7{lhTU76Es__=GFEu9Xq^c$DRjg3o6vXuKZ13J^7FBC!y0IT4^&-w%vgHCvi}4r? 
zsB}?-4&nB5!XbFio-JDT?C-7`1O?6+X!G;$AlFf0VqmPi9DCan;dKaoAKCR^f7Fvd z2(2rQ<31<65uvNF+E-})Wk-3CysZkhkS174*UE4+er*8k5Eu0()oiV#kTOz+-o z+pJ5=+^o8ANRVr@pZl-rPJ0gePIZye#YAR)YQkI*{r4z%DN=rA_E9{CjOhH=N(Dar zN(xVCd6i)t-%@)2l&gxHo7;7g>vp2x)Jl?BcQ{)@tLAN~Jb7;ny~1aJYGv+JFEVSl z-5eInE0Fw{r?|6pp9H!*X5{-I$F9``OAf>>J+X{hwLlE3M5DrSX2<2W<{K;@2w7SE zU@ssmtG(6-UYhrmjNg$K&Uu%aMMoN_SJ@sM7ovxP@z4ME_r!U{&>T39K(3hKHrG5l zQMnjcYMghi!{-T#)oU?geX6S+0+%u+VrgjM#A9id3pESvTfF^3`xLUJLh=;pmJEnO z1%x!`8w}hO4OXfAzX%7-bkyXHN0Cr~FcgpO*;hMr#K{d^vn1_1vEd&gcJ$7%fRcg^ zc35~!C-Sg6d-7HCF+1_7!hTL=h9e9FH{+uE)6;+U*v~Kg*j|LxvR~d%E7V)g*I5Kb z_AX@IZPavp?d`2Mj>ZCbtJqwvcsP4k=lL8;Zo1jhR@KfOzLM)+OI}vYxb*c^cm!j~ z!UQ{7rJMv(QE3H)WoB-=K8*eG+9z4eR!uQBgpzi%wKiOu5M2H0Vvy;00o2xg-j^4w!bPc1E z`t-WPDpX*tFW!JMOavvN3ny+TEs|P+qSNrLByb|?cNecUx5P<09-~syg*RH7H-zN-N(u3ar$K7AX#gTMx!0-k_NP@+1_k`dYoFurrOM*KD z_kktBU1o4daQDFnk`QcgAKcyD=V^BTyPMp*zxRDUKi?R-ySn<+sUz39stWT-FF5Jn zeYSsHytq z@qg&J+#Pq;>9`?ITk3qc2IE8xl3Y;7bhmrA@w(g^ysN3(XZrE_dRqca;#f=SxV(|w z*{r~70?DL6$MNDZ>($M1YnK6m0;6f$Wgn2MP&7|>48|AYFD|E5bW?+1zrV?{SXo8-Mk_C0QqPhXx7XgE zt$ptr8i-@mj5ZBIk_x$*r~9xVc6Lr@SKK|#MyWF&CTmk6$l5*(yj68|AT(&y>AFhPnTYS6f#6GY*OV{3OipLT*d(AR!qT?TE5{(qx+%E3xu5li|6qK?s|naR-Qzg zU<$X~M6Jdftp71q_bH-K?vkMEkfe{Lb08I(avh#8pOveS+gyDlTc@(-Bco)dmI=k{ z8$~x>$|;4huN6~L>rLq?uw!KkVk(4ezZ}eZp;DhR!yw!|$zwO&fzQjC)eczK!-x7l z#|SF15HZY`V%W^U$zMaSvQ)*vM<(}>WHaPyw5|Wht2Gl)q=m<^1;0AiKSy^#?%nLo zbWS}$ATtU<)L9jh#qv6WDg9^1jC2DP(ykQx9+3sc?znDEe9@qt|7;gw?-=f9EcAZ% zNw8GR_Z<$?ZdI6WHnjv!K5T2c=CIDpGaqh_S~$u$MDHzpGdnh24||?npjK?c(BQs# zVaDgmBUWD=c*kArvi!Ty$|5XHIJfj6q z#j|H5#(Z5FfF;-|Wvg{XmI&RW^2Wpoy}pN+6P-=M>wx98ZF4#;OD*$${{0RlygfyY zVT^Tzj#{Qz8_FXl?(Dv>R6gzLnuV~EL3&Q#F@l6xj#%0vps#63gd7xmU-cx@r-uY{ z+uwXc_y?YJl_^KR5nC4MJHl?y6W2i{{7Zm-s?VdCaZeJN{}J*}Dhu*E$1qDGI+xtO zmy>Xa-@C>JETQTG11E_sQh6b7ny%5e)!9FO`c&Vs%O{n?aE!S=lv`?WswCLvteO6t z%ORlcGh4~-GGFz3e>#r3EOf*p`Vj<5o=-&&C)+Xw!pFGs6GL9g7YU4Io zx){)W>QOqAnp9JwQ)$~ds=x(=Um?(EKM9uk*1_JOik$x2>iI1_pQ|*S7jCH8WQwgD 
zX(tCMM@HM*mrXz2+v#hTMi5r!+(kq5&?WWe@Jz#xv-4@$om1H|8&-pWAkb_Vrk8WE zI7pzP##Ya{&APg}+R$z0m(10z^R9+nRGutP5muqZtyk{^unC7Z5tVki-R-whXSWev zule8+;OpzVflW`dMtJ{UgDz5ld$zENL_s>T3j=!_M;YriR|>v}Ald zDhB@Kw$#2`EMCF8ZSCWbXUsXww5-k(qKLPmksltzoYt?D)6W_FQxdfbj1jVljjEXc zc>FuLD2yETU2kaqHTpnA8v|_MUvUE>$KsfVIp(f9824F(#64X)PKeEo$dx)&j0|S* zx;l$a2a#GybIA!O13g~pem};YNG#+!vy=5HFXZx4nBaVXb!f*m22pjFs$gNC%%D;{ zH3YX{Z6xAyUk0#N9$`a!*>HT*zU#LN8^fQiCIAwxY@{tqry;8I3%^KltY*h{1>j8r zW-~``kvn55lhB<6rsM{CjWx6f{}%B>yr|rfb}t^w^ANEdAz1&Rno5|3rj#;ZqWa?K z3*Z(*^qtVhIm5M>D+%IY`7}P$0)YZJ(6FQK-XjQD)R~%Ru|Z&3&t+CIak>;cAp3vpe=_u&Y>X5Tnmpr7&g?8+OvZ$hsb4#N^vR9_XeG+nwHkY6oy zs>rXDnVi2?&3?k?Gv`}#q|=e+=pWGNg#79)IQYzGZ|)|l=Hi~Y@61f1rwd4|PrXDp z`(!(9z=&C&Px#%tK64GCEkwf*CuJ(-+W=T;<7W*qf}fP;PnV!v=Nl$dq*+(1)8`Tk zLUxMoOQznFBd@dxx68s~RD2HjaS!?QWsK_`(dLnFSDz|j=Q}D*8IMnJg0A1mpF4n2 zDZ>loQ-#WnR!!S=5)30q1UcXJW8|Q((JJU_8`g`-vV`KZjLhdVd*sI#LatcntpwZH zyv{ma56vkwNf34F6L|FMPhT$e87M|-H9GAi>zSf( zvbX>Az9rwcL86AAN%p@~{x=lnyEjRf?XxRsK=nk@Ya}W55-c=fpUm2p6mz&Giw+~k0@4}w)GjSEoF=)xS82(zu_<9$uvm+nv-uJ<$b!X z0JWZ611#_I6v$oyVyobD)Lgn0!Jv$0yxiG{#$lXWVfI zsM6VosbOoa)btFu!_n~FHXo?{zO|Yt&Q!3Wu1w!kHO;nK^-{f@Tp0gCcz$y)lS)n_ zS^hT%>6W9!*mi}p2~JZ(W65tohS{R?R>F3?N_$qra3O2AHR;45$jq;L6Wrb;4s3I{uj`pPf!8wu zey+a(DDeg}fDv?-IQI!o7TRpeSBz80fQw>i97EU_pX1cB=<}^^jKCQjX9Ui@>@J18 z2y9zBlF$)#&Gz$O82m5qF9d;QxVg zd%oglhu$q-R20;{6u}omZJVQ|m>8ACrBS0@WPaWT&JGhIX#(!;-j|avuhWLiu37Y8 z{kzpr2Qoabu_?eQ;l1}p@d4Az>h&rt}!E9LXM+N|##EjD=`TODW~#?VTebwnvC znS5ntnAK;+kN_Tx=C7nj{$hO2Q{$FmoZo^FaE*?4p>gyXJMQXr!T2 zp<~e!1NM~oqO6wTQLSy%d|hm|8*xFhH=l`aR0Eux=55u{!8FO;@dnKqft+xJ? 
zsGKg|2d`hh_9%D^rhRAxOM0HXx}H2w%ImS0*xSc%DxZ?z<$kD;#(5bJ>(gtn)!C*{ zK-$gj*EH(ZyjFL;dUbM&xt@w6FzTH(52PCbn4Vm$&lO8o_oDp|28~hz;E+_MHEDZ338#5T$|gj&`{QoA-)hU?VmqHx3SNzd^C~}xlFsL#D163lXfi< zUYIi7&_ttSBrsc;8o~>p6qv4Vp6-PNr>Oe+GvuJN9xH$SsCj%fX)x#2O?%%#3_+=^ z{`>iPhv#|HYJLEL2PC`#iBJ&!OrA?YEfY>GSaLw)AQBOL<~2MXO^!7ZxCL-`O}1pn zZyY8H$oiV6N`XCHUWRqs!S?mvDYjGrln|nL~Lpzu)v-<@LYeVp2U1GND>6QFpM+|B^*F8cG zQ<8qWjxY(Uh%IpHJtY$xK-N?hTwz=@(DMkcii*)U6RcMq8z*WxLy z`@vGE#p{EAnQ@VVCsEN~u(ul+^~wf*5TT-qLc`6+aw=6w2os~d_esYH0DrmH{-GIk z_7xKUQX^LHTn#ZdH)rl@rkF^I%Knc1;CjH}>UX1Hib6Xs@A~2(dzrm)f|Xg93GsZ6dzMfzKcc9wUNoz zwvVl3;ur=Y70Eh?M4mdUHYC2nqrCUGNohHs&8ht-FZ$`Ps52DyWX@ZL+jLV4*$Eas z8D=a*MsYdas}NIKg;@@x1g~hR{+{7M(kvtu}GCD#161DaIA1LPYs@1pix*1i4O?BahfXD z;ZZd<*UWoZ=v(x)mI(`bQ>eB0*oyUH_bqxeOCUp@*N&_>Z_?qGa3!#sLHEEH^S_S;t{48qx{J0ZFJ&N?fVB5lvHNoRIfR|^j$KPfwg z6FDadjlOAnUSZr>5JR)q)-qphIdRV{i-C7Yq>b}P;CPd3wEd;A4!YpTvj{Iv(BTDE zp?c{tq5(7({}OV<#>`71(l$%PZ7#^5GNt`A5Ym-t&3a_L0-pp$0DLK^PIHpI3o8-zhUFd|C9`hEge(&F&W_ej< zk=dN zVQ8YD#F380EA=;+f7#xsn7f*$Q86=e7e8V9zJU+!-n+-FwOnbuTyYwRS*l$g5?GzK z@`pCQx*zvaM@b)hAov+&EDUh$Oe}Q<_V!VXWKoXWCj2$Jbq;25qGcB+cHC>}-7#Gr zB|1?9_jTtnciDXQ#g{C>ar6f!YQCTrR75b7CzNE60KXxp)4D#NtX(GY&?U@?W95hb z^;EY-7hT~U6Nr#erjXWyxlK=*lke~-HlwsD_oht!5AO?F*R#)r5eqNer7Ydt5{5}0 z&85@XOcZHVtaPQ)>rEu2{$Qv)WAn~Ufrd!_<9i|>z4E-7icy2RY|U^v4su4Cvxff> zVc0WK=n&NQMrGtX2xOAbQLxm7rZ~H=_SztdzQQMUpy{kk#Qn)iFkM>rb+(B2?!H3e zef@Cz(O9PR7>FwFGa4gZ9)JJejY;S5XcuRTpaF5)q)GG@R9VV)`+bazfz`^y`DCcS{e9(cflx%>P8!Mn zsejxn@Yr)8tfZQRcW(TCoXjIW=8Z=rN-xUy^T7ORm!gvHGV8+3i%qrnu`*h^R zV35ZAJ45mte+UdN)FJNe-w`lS@XJHvwwZ0@JUIK^x1}Y|(c(onx)s6Qp9J}Tvhq`X zNs%cfqrn_o{F2*da*GUQQF@0r`p;qe`%7+H4hjZ_!|;ok{?8w3|AN93PS}pGTkAAn zBL3vx?E?>%J&ihK4UeYfr5s}R!}$5>zxp7G4^(Tv#5%tH>7sH<3m~7R`;pBk}~r5->3Mq#BhHKXZ}Ed=9K5=qCt=%U8qxh-*Tabt|`sq+a5m<#oW~l zjHmj2!epr<@=%FRM&4{u44TyuWpmcyE(dn9dYYn3B=z;{SLO8m-TCi<4v9d?$m;Iw zF9Ps0W&(gwMayor$Y0R{DZ}HI$o7{VsmnZCJS{1A(CoygvI84XGU}Q2|n{6m8f;~ z__RoSNZw2bexeo*KaH69I84*ZRf+D4_bAHAe2^wnrxRhKVgBqz&xKxvwxfhsWMi36 
zXl+nXW$Oep(P#G=XLb6MKU@;SP zsftcaUz?BmeV|N3ItI0WuJ^Uemf7Kr_O?ZFPSamj>Q4-=;%TkdcGqCMP(_Ofn6Neb zL(T3P7Qs!xemP$S*>$GbQoGtFz0RiVqV1?cMRfTIIlNdZhK=g0a_?D*!Nj+m6maCw z=`x;u6n)-`9NM?cPC8ExR}w0LP^QCyvOZ6eCOiLA=tVyM8= z!NwmL{RV#W7~;`aOiMcxXABz-<;~KPqherd<19KIdO$Tui;gs0*EllUE{zywx?|~AbVkK=kX9*MEQuz>-)b<+COI)g&0QT_5S4~67iTt4-*sf z2-sajzu?OgKr1OeqiuoZXpVaF$bWe-DEYSjv?5{gljbTHwxpq6DqD@M^8f`2%R85H zN6K7islQgP4VfMpSYaAY|Mu{|^6y$a$yL5Y)t?##YVi3Y!G=Fl+nxshSE}c0j~@QhTY%iqVsY=qp?9{%PeP-qYI!N>!2_K0GpHs6xTO>! zSGtIrLPe+-xI1wuONv-YL3=To_8B2AA^6$mr)ZVtFv;>PZU0f3O;;=0gK45F8r}w{ znUJ8A37Z}bm*B@-Hr}z|Rk}y8ha7jAc}N)f^U7JtI(&vdv!MTdaOPs12EUAZTrsls1G1&DPnm zShn}Eu*t|qxn`&8BC>g)%~CuoH}Dt&rm<3NuKGl+Vgorr21}7bX@)+HCI)$~pKrxb zu@vPqpHWr5I(N7kJP%d@#hVf#C#u(tu?|T8k?8GT6krrb`aS{A`4qN_CGt6PxHeU& zNq0iI<&UoDuR(oYqkW(!pAE!Ct0R7NK5d=S=fVMFCOAuYzY1{qD36jD(S)TSDwb`!p4e~Sy@25EsC5vWk)1p&~6Fqh^ z%?~L#(yHiM?VaW?x7s{S_g2Z2t{#uies-(m87Q9ETO$y{;Sx2L*LD)iKWX(Xwdt|> z#2TBs_kMGqak(E5;UKD*T}b<4U-k_WVi;L9=Ki~MLBc1C+ zAsoyF)$6st5SE>&w4IdrE}W;5sow4Ry-c~g=Q%)b{-y80Pf>bct`0ni{>hO2hGL6R zWj7+PUvq5>@4Q!mG2PpQtksEIf9IOr5%}*WvPAq)&L1{TY?`=}3i@;tXz{u3dp+Eu z*L(9*0Q7&#EFkJTgekBT{w49Jj<`jL?C92cx^8mWZ4_aJs$-F|*6OZT3_w%W?hQPN zzVquuxNTGt3nl)gs>f+~4fL!i{TNO@oxTocvVsoxTU`YGyN5J9x8q;q`tjtafFsJ` zuCd?Ovax$`Lw|P_Q`mU18PJO_JH!}RU!as||M609lt?b~t7D0D@|pVGc9wRi?I=6L zZ};NA#8Cc<1^i*j6|<~02Kx(L;SESaoJd*WZdkHXRqNKD03B_IAWi2qrv{A=)N(f~*O zq%Vp8UmT3T?)bY)|L(@^Em7UK^6Z|aNcxYt_W$SEKYJ#!02FU1_KWJC{=e_`uMKzV z0&?!W!usO=-+lc5DSfyg(iX zC+@G)Sp-N|TO#lLGKh1cbNOTn>HMHx$vm~DmGq~%e4TO6m0&(Khf7_l&ub1Elc(xD zPX}BS2)1{oN*2wig+?z~-CE0zL3VKoTL2})pj=Afpmt9|g{~8Bej>5!)n9D{`MCKl z@z89RWPcR)7G1VRQ@~q4m8e=a`Y!_SKf3EHHFs3ULFx&dbKn0&@Ti*S@@UJZE7lwm z##oo3UuC%7(I?E#9!7j^6PnX=)IE|cbigXGptm&+yeWh#-BVgJ1Jk9l!*J1?$gskMKx@WD*Ld4Mqg zK%D#r+X?9TS^m?CRd9$@iio-S+7e-NpY|)W#V;eljuMd#@b8>ByHgEToRh+=EyF$_ z;wE7e?8iW}E(E7Jva&4r@sf zJ2Wx7JCbSvk^^P8haPll|Dcf<5t;^FY~%A*nDA`Fqh7h51~q*Ac(-sL9Hn^Z7TOq$ zLoUw9asU4PqoAEy^Q;tx3=bl*I+t%^E44IC29*cZ^Zwt-1Ckl?RbgLV1jzjQR=*{d 
zDCTpX=%0faU?K)HvU(2a4w2^erCNkYt;uL@^p|+9_Q5U(XJ>^!?}}aDf9_WbI#V|= zilUVPEB}z`8aT<8iR((D4eb*DTQ~nN^}lmBd~neQHU`@t=XC?Xs2<~B#THF zGBd4|FUGrI1Pc;9xo{bd^rd`6LUQ#LpM28_bLkN(p;lMwOASDg75~TW@4dPs&#_lq z6L#B0Z?5kr+{Gpp^cwPblf%e*X0z27Ib3A3^xnAB_y_GT z4bMFWh(LQmHNoOP#gD##nLR7gqHGU>UoSCUy3P=cx~^?Lfw>}QzAVdlI<14H!j^OKqE!nda$XC0R#nAlx{Az43iS&v!8g}5*2CCEY=@E;MaDM`&{?812d8Qa> zJesH6q+U(be8r|!QG%$w**Lax(0&*9LsHV`HX&e@|2=A%sU!l^FcN zbLTM#1-A=qcP8oWK2m%vc{P$0V-=Grpn6)MUQqPye_98AwvhTI!)iutCf+SstCqeF zRNoG>H`iS2Mys68_Y>%p$auX}78Ax8&!k3oUd6y$^D#Pm>1wB+`4p%IDw z46RZ8c$mWL>M{nK7C%jMtFowfTIYjW*y-MQ9__*2UaDC4trMVF41N^m)47X zyiKNm)$X9^mfuZ6-@JJnM%6=Rqh0CIqf9GJ1gTuhXXW=SnG6UOL1 znk-@UOMHIE423X7?8&b_IZN3O5xSaxUdG*%_UscgY1)nR#WR1Si zX1%F(-t7wN5=-#d8rA(6LiX0Jx3ks6V|@nokDfh?LrgX(`n?f*h%2AWZ4-8}N&bk! zX0O4M{_O=OP*QxZRaDE_c1>hERquYjv3!X`CK+w5Re|?HmnK>v8|VK0SI#@#sSU1; zO^wo!=c)9rwx`4_yt^~at`}R0=r=eOCOz6vRjqTP%!U0lEihI}v)j$Y@g~6R1UwI; z+80&6(^eDJG=RC#zIvs3q4PyEc#7BYho?l|qhI~eZ+9XeB{X$7NSgY2otr$8 z64G(Cm_46NZa$q_SpON~aJ6K* zk=btxQ2}|bE}#*S!dCE_9sc3bDsu|Xeawy+UdM8(A0G#)4-WUGhp zCz?dAY7HNsf+V5}5m+)1*myV8&;PA2Fh>(EuS-DlKBEQZHqz;3ca{zS-qCGIix zkTt2Npw-98YHPu{Y?!#FcYK|W#dr}~rR(-|F?V zC|a+}4W}6r+ebK-I#YAShD-$;oIJBkNd;%v7>Cj^yz%<9IPFgTn!e?+FLk74jyz?$ zXdbRnu6xKZ$W~_5SAxJ;y=n4>1F)`az8D6s#i?=fOy4PkR?){7FDBpvvmhoe8E*G0 z@0BbIj~n6Do}@~1SZl|dpnU@$7Qa+a7hqE>3wW;$B_&u6-6U8noMFUZFw;?hMan{N>l6d{J>6 zV@yhhD$mp1W0Jj!=!rVLKn;&0?r+pWmwU5D65q(R#p!7r&L`9YBS^TFh271Jw~1Cr zIgM*RSyTcfpW)yk`f%$u4fob&8~CWV2TZkI^aU_tjW%N^S{SRgMKdL*v3K2Yp9rdhs8>0$8t#C{@v+9qY@ z`%1j-MDEKzBl;T^gmHBMX3_EPbgq2(1jC@k+)k5cteI?TgYnl-GWco%B#25^ODnat z+Q0S!7{sE~T5xP-JT4srlwGgJSccTn&K?4Vti#Qtd}W=I6kcQks}>464mpvq z`xi`yE@@O}NZ0bqF5k#^rdzfL}^>wZ_84RFzV8yyiQu7al71E`+j8G>DG;kF^(}AOqsXN0AAtzs%t@ zrm_#4vj?@%fk5W%bamo$;jw(UL^K=|#UI7wn?l;4%Rrwaes5>xiuk>2KhJh5dihjV zhl}ZbmHdO0CI(y4E|pz|9FKTQhTO+d59W9z8lBhjOrV+y@!_-TcDnQ|i4U)BhczlN zlF|ykHTNamtk-1v7x#@8x;;3XIZ17BgEbl(+I7<#h;fm=bOkM5f6nbq8HE$gH=OSD z>eSoMAJD!*f#lWDwFs&^mVqP%&_w3gR+TonkPTflcep-i 
zRZc7WovWR@u`r|45VWVl!66Z%k-1M(34w?2y?y(A*zDY+nw)Ki)*v!nD1RYb65%Vd{hFekwID@Y zB(0a6_ri#mc<31mgnfZB@}@6^$Fsk%wDawWeEK=Kyn>Rval6tq+vr1Z^w%#Rnq|-2 zC+v2ye018xQ&yrofGjBT!sW=n<2#d_`;ybAa^Xm%V{bQkGts5_wRnzd-IU|I-DK(+ zuyusg6Y4_#r4R1z*PP3c;MuT3jfRpTYbCLeEH+J5s4AgKno)ZsX}GQug{SQO-NLs9 zlaBAwUnnTr_a9WhG%<*TG&G+L@Ka~j!r_c%e^>Kc6xbCf`^#`CNF-&x7xIyOeo;gwtU8d{~n)!~5Q&TxIW+dPONt!diRaI&6X%GR;9tykit zWg(Inlby}R&|FkaaDA;&(F_YIzdM*jZRnHgGMvz7vZCBn?C{&C5_;SnVW$janO}rs zVGK=Wqn^Hx{vXK@-a70zB6JGLYGLHfri6hv(P6-kidG;2_Q2W793|*zpb*CO05%a9 zAUNwy4n&5&%Am$WqWJ%=?~{rA*eP7db?+eMb>{-@azBm{!$h7+gvuJvma`W5lTaPS zyZ;KEob0;c9vMb(X-Q8dV+>{woF8xmR0bvuYUG+0H8Y z4C(RhIB@Tls^?plQ$cgbP0p5oj2W95g4PgHW+$Oh1VCmU)!|84@@cnXVl-_)k`3VO(3W>Zn3&RE}Gqyt~JuHnUA+RR6oEUbi7o=cSgl1lD1hTc+zGt7G@SH1@F=O02f%ag` z0JzKB&Bfh1hn0)7)My1kgt#1u&>n~)w+VFqonX=0Neh$0^H6xEHh9J$ae4c6R0R1O zh+3KM!z{@0EE9_;R{_5~NAIK~ozvzZGM1i5SfC=(CpI8*gUbuR6fBJ@8E{{MSdXUL zj)_eq?Gl6HQHL~aH4lG`+YO@x@6v~Za=gu&?tH*W7X+O&$F^suefboM?cpr4X{ykb zkH&t7F_g#niT8HSaU0VOqFLf3-DfOXch>~!z2^$-OQGq(nOYvyur>-;Iq1j zY?i~sesH2y`AwB|o>>Bftlk1cfHe^_>RGgH^EgclH-pK!d>dqehad9Z%IS8xvc^OW#Izl6O$>ne~S}DZgW1m^{3HS)}K_3tRr}d&#IX znt1aB)cpACs#Z8N{B!g!CQ(l0UaQ?~?#-Tzxp^K5yJGW*k$BHEpKClNK6Q%RBnGnd zlcPzyhx2K!b~Wh)63cyJGAR0bWjW)GJ7FOPxa1%@7b)!J`LT<|PmOpKK=r$V^0kTs z!WadWGoiqtOW(3X*+nx>!lEd1a?>~BXdO*2iAH^d_t9~w8+S#mfebyoseKaozzXZfVw&v8ZY0 zEdCtkyzX-Bj)Mn;fzb;NvXjG_*Lh;_ zusvLZFi8lbkFCZ?&qptTRF#u?`TkI8JE+0onh3G>e&mA7hpf4*+`#6nRYttL4JFErjcF_^BU~g4A z^eS`h7!PlKFgO-IzuIAqUHf21fhGQ%U%~_3CJ!l=sDqoOAy0`v7b^G-QvlVg{CT~c z$Ugv$2eeTv3i?eO0`3kQkdHw6?ia_Dp)iphmey+keNMyKx)wTNEbm-wg${>0KH>20 z#l$7wy<^8xI#uV;DMN}z(HBI_cA%CX8qd+S%<{kiz)M3{iQn_Sd4DskrL11e z7Uib(m~vH;F_ldL>`s6%wmiA|fg%112D>L_+eEiXKH6(Avx(SgtIF;U8p~6bqgyd> zD+#P00B{bq@%u$Hwd!Dfg87&q(r4Ye&kvV39^FgiCF7e5Uw@jrR**yVWI=faYBJea zr()`JawDPkQaw7Iye{-=fXeZECEfLBpAv4n*#hh$h~%OfqZ#%jIme*&u|I^#iaRsE zV#4@y24Eq6G;HV)z3|MwVBMN%@xwr8Ibg6k=`N6<@GfbEU$D*O4u2AtETc$HsAu2bC-8T`J8JzTr)OY1o(t0h|> 
zc&{~MS0xPKA?!jKr(!~{Ypf6xLD8A0;1Z}c{pZ!-(z37z9qs43yf)-}O^xjNIA??$ znkSEIKWql1ux8jYY(Db2F>TZ~qS&vO+n!Y-?_=u`9o4bSAj5 z$Qcd<4pG-d2Dv<*TOSCcV+%|Or6%!;y-k^wLGr*MtwowO6HXeuoGrV(q|_BEli0w? zB8pa+G|F8!7uT(?z4bZbrTgJg{fY=?Q80RuUfCCgK8P`)@09$E`ZVOU;=NT^Nc7;XcE0KQp&He68|_^BzC+x%&`&e zNOna2M6J#Y?MV3C!Q9_2Z3II*^w#LUP%QIZ79pbsbzR2x$)Q*LS>1{m`EwUPmQ%|zM`;W`D zj=b34?!!~+%gNJpVH|5r#}p0G1u(o04G~L4h(mz?!*&e{ozbF-t~W3~^|P6|8V-{x zCrjkKe-5UDkpJUYAw3UwjIcxvm9(~FqOPLG7wn%|vJvdnyKWE(%Tb>KHDYR^tM_ML zzXy^W)<&Gqk3C@cj!!LY17N4l`RFqq-jbKQdxK-NejU9=~rn2Cf^c+-P{l3Ge(}*k|dc!#37yb4A>A`#Xqm!*63ohf59> zo!^MP(F<@H10!yT@yIHr%3qF^_G(!Z`U0mun`DMGjy8!M?Q~3DPmTvyNVf**LOnha zeIKrLSJXtt_8s}b{XJf*+~T!7XtK>~%H!!?wp3AAf5hN^aWpNAO41%Y>%ajVfS?UA z5_-9?@0XGECXa8=`P}~N%l*4|#YvFGH|`R*v=>3aTP7)&!AOpnRXcqzhe`K%xtwQa zvV+fQ5t?HR6mBWuZqZhD$3N3U+(6lc?)Zl-*kS_Ub(&aN6R+fxt{A7e&3;BzIyPV6 z5w_MT-)}7enITVd)7s5?9xnE~2X7%43NNHEJh&gg!4B+kx&@-0V7cVkCNo~)&p;{L zp{ib11I`|x-lLrqvIdFo(olqKoT1@9J@rjNmSD>Qi}BIfRT&Hhg5n&B?@F6Q6_4E@ z%n;udCQ9vOat;Xm()1eenN(Y!CE7zAC<1P_eqR^2&B5X#mgRYn@VUTpac1g$ibuqL85zGdC0xyh_(%b6 zcR@o0$wAH9vV*E|H4`*(YtWY{9+?YQWFfI-v3o*|h)r42a_?c)1L;jUG{Bko8I$gC z(5$EZ>^w5!)rwRVfyKtRWw6;}OY#_-D6# z63KiNzmvt)7h4LjxQmhjmOm>Ly7$YvThOqj{)k}ii&AHB2XY?>?$vk_Q73()xviO{ z(aqT*b>vST99EIlTJCOr0;g8d#13`73kE^`ZDQP3t`Iq-TDl`?zgg(G+VEJ$^Gdc? z;>+^HA=GgzP1@CQ)p7b|WYZ~8@k(`bOnKDE9(3qd$UfMdv?a{fakW)JOjMa?e~Y-o zi?MJP@irM!5}S!@saE*j*zI5+FFpLs?Lt9Cy-$xWHZCGtHLOOX^}}JCFmL@dQCQ9H zMeJ!rJRY-Ve^4Ct;)pD?^{UAz)WcVVyUmd?&(N5q-bF#X_mN`s1NCT;- zw0_Hp+p=$~G6hA;j(_pGksag9YyyD4M45q1qOco!vS-|^b@ z-IFRheY+P%qLzM50#8|}$s$H_Ws;$xP+egQ9L^~3yTfcK4zTP+1ct}uP`2EuQ$vJ! 
zSlGukf7JM59S+@kk2=p;{4ac5<)$MW?uSbwn{oD%<{iC);6w6T_~QHh9L$U5++C2D zb^F)&O32mOvZgb5WwZ1M2|$2{xi`nlNTCr9{($3X+T(-LUWA`n{+2P77!Z}s$o4b$h16;`5{x=@u*p<1RXe>UDcT%(J%UG`oWs8z(53} zv*qM#n|Gfh&FUR9m%tK8Ra#*`$_Hhmv~|vj0J%xT5|1@JJvU4ET={TP`cL+=a^)>; z|FA*f&z}nn6!bG2YG>zKP+;?xKQ;+?AwCDawyWDZX+{;SPrl=K2O!~a)GJomM+3f& zZ#zPO+yH=EsHM^p^&2djhOC1hy|^q%<@21VRT`0w&(u4hY)8vm#Eu|CDpZ#_SAsV( z91#Ve{)V@f0{W$W$*4KKSBtF{@cdJTN~5`seIxNK62lS&Hold()vXbdG9yb>HV2oh zfMQT&;@)+w0p0k4`Raplszp2orS5ZV8}Z_q87M9nP_x=l4~(2RA&pzt$KB;rZ19lF z-GA^{>q|mmu$kRU$6YxJvCKEXn5*k#L1u7|*MncUDk zd8B5e;X0FnLHrV@{mPV9_2h@DMTUcqUy6q8wG~vOD|A}d9nG?7fr_-^3`M-$HsNQ( z1%i$=CY?0hy37L6q+*v3V7hh0jiAeTZCtEmrCDXN^X;y^qNDNn|;UdNm~r7Qh-ilPc&-|8S`aG_wF^=}U26n3PO@EYhOqrvvlx z7)?PqZa~OL0dzpQXA8`z@8C1sZjaBREqh|Z!^iKvBRj?)PMgwMy*V~AoW!4;*txzj z=wL**+vPl&b5LOMx$#SF| z0aR+S!u-7kac@2qGfIX1!rVM4GM(?k{y{cegC)Je?dAx<0nlIco&~qljH%Qq^%w&1 zsZ{l6`>)S&+<6XKQzfTgPKa<$2SdGv2}^V<8y91;v*154#Yt&HCr0h^R++w(2%YCF zb-zCi+O}ynCySSa34S&K%&{*qO3ti94!Yq~o-ZTd2~qD1r4!cJp5u0;)8AX}yycj> z#rX>Pq`j2rB2<1rju@`jdeexFMm(pTO)VX7~wxmSDB`Mqz7){lJ0M@Vd5j4iT8|6szPF=my! zj%C^|ovyXj1Hz-nR znWbGIf(>LhOC`|_VbC9Bd-2(b>`Yj(hIPBLWI^r@LaQeznou19mBS+=fY|6%-FRP> z?t1enr%!RkECnMiDv@y-ephH70z=yX%VKnF3=Qobu=f-JZ>33(lStZs|@D0cinglmd@;E!Nv!;-a_PCtSYg_{;;p7*KF%n! 
zIG?$P?mMk+3K*4l-V~|ESC?r60PH@B@=_ATAMl9;5@LN3ANm+JF|Qoh~3Khk|5Z7sXT_{*+TH0CX- zYLI5w&z*|vPuXXkeQmvFFXb6AQHrmcgHIj182vV6gf4m&!6ZF{vDd_YP%oS2EmZm~ z@^(2`^jYrxSpX=F7e$DRO#8*AXqS%PVFNRyY1NnwBvQ9 zMq~zpz}YoUu#@;If^lyWUa)WO`>&4Iav9XAnNO*nSqgmFQ0@H_@Tamiha34_CLOtD zjp5W~m%bM$TJswbYuMr~3`^p=?PA?oE{C_PkC@0rO%P^S4j##xCV$ey%qM#7>9Q#-L zO^k$?mp-#frHbWmEUG+US{EvuoW2H48i)SLvajENqps@o&OrmD4!W`@4B+>qrC{MZY zQtpf(6XYXnV=PH~Xe}i#k1Ib$4i*oZkl^4Ok?05V*d6xj@Fs6);k4?X`)z6!h=unY z)1_>1Dr0kfQHN^aBLUoqNW^RZ5Ki@s z$RmQn-x-YE(64rBEVi`e<@%Sx2P)IDIcyC1@CC2tsj^-a6z)Z?lxy3*KZ~<74oL@f zFO7mG*=VSi4WnX)p`5AG9DKpu@okRaLa4KZvbvaG0%yC#PqXMSJ4tQwj96A^=qmU4 zik(JIS=1dZqWN^{{Q=C|Ia>ARKJozND;c?xT3V%5aS^Y0bnHHEWk)XGLA1$dWs@rf${ay?S$7O0IQ-!|(PCCz!BM0UBTQj3mUcLsak@6Ba&y=q3`Y-*ASX!M zOLW@1tp|C%+Ff68B4(kwWV7tQRZA_Mdub9pJpm==hrFNX`_Kq1QmT)gTcB}bH`btNOb|Y6BnLlD{UBx+DBY|(x$dOD zTH9K4TFPAcRtLQKc#%ABcfP3x2J?m=&xeV{mo&k1opQ$=5}OqJWu}ITI~;E*={P&L zcI#`wS@ukwN6%`K`ha#_D#v@BrSV455F)`#zYp)yg*x{sm8c|aZGJo9bmXXahAV?x zw*V}Ey7ziY&Ja}l&@f?q9c^QZ&9&&Frd*;X;xs^i6!L;$bRgsdmD3@0RH4Wiy7gXf zDZNN&bvU|JsBw2#RBqo!c&512D)40Pu|*n^s~DNk5?_c0BEUP}j_I^DBq81(P(V}!O~eEEdw{g)FgQl z=B-p!L5(&Hmv?6okLrSBS*9dwjpG<&PmnRlbJ&WUkTIezdK?mX`?X#;fdlDxJbV|r zLx(g2!Z(Q;-Rx(jm?8Os;rn=3mSIv3#Ib*k_-!>aW5$RUV%%%1Rk-58NYXj9{gcs1F#9L-&S-hDqmN;nC3NH_;CZR`Y}*%a9rtV6M&)PKzp=1ALi-eJz_>Py!)Vmgi2zKEm>I+uh*J{R5pTK#DK&_>?`_bkSKI* zuTwSKHv=Kyo6AOB3yb^-{jY2};-n8a%4xD#*B`RY9-feK>gIRzWe<2a-yj>FhE5W< z9%w^!QuPgn+dDgxNtawUw@pnyqyG|_`HuU%lv?0xr@Hmm95A@ATr&oJ^WgxKe5MAU zU9~*5%WA|4URyA(e7(>>r`(}?iU}TY76A|&n|;HbSoffdC^jLBY;cbyw2Vpyub8ZT zEk`hQ4(2gXsB<6;J5O2xc$~04r!}?v#RnBDZv;B&NuxSI%}35XpNd>THG*=Hx+A)! 
zY?Ws@u<#;>)PrSYDIcT~gp4y$N#Nsn<=9V+oAcJcKvbO1SC}f(^mm4T*6yDkGOE^n zirCsOX|-+g-NLrD^7isx5xY4w?2hN?Y1gT;61ccyi>r^q!?mLwAmrC6W-Qeo5N2df zXeH8(Lq9#KE%;0J!1lLGNC&t9VoGIo$m{%suFGcSa;%`Xl7#PQwM)QoHa&Zs@U20p zMz4;6Ct>wk+!N{PA_uckC5&_91uHy9;>nz9zP`mL5z*a>vOBX*``)Occ;Qj(3UYh_ z6fdKfpTOV5&L;^}SDrqQBiG;{M;;55?}&U%1fM*dCFWb+jXB?{I}687ae&i*X9#NnhnH# z2aBFgd-V8x$*;8yBCp*xCtE-jW^QNM8Zfi4{X-}XQ4;ywBc}A7XuQsVV!g9o^De1iPB(#Nv@pzCPlkwrk7K|EPd8NHp6)wKr z!^rHGiUo~|+236&wup7XA@JWOAq+L%oJ*CYJ+IqWhBoMRr?%Q-3j$7i+2~-aPdv}} ze{AuP@;NSpnyCpZjaafPfMvtwI;2Ts<=!385%YG%a3|!Bvlp8e?L75VP5$%!shD0o zmn&DKMF5!+0@i7zn9T3O7k@ine?-JqfZSn*=w={hWZHrR zl$sCKDb-pgrmEOXrOJXPvLk+xiV<$CB?EuO&Z3bb$ond;B(m?ZP33B9iZ{*F&@_If zTR+I$FRZUjqLdVih&4?TQTx06W9Rzfkc1$i(yC$YPzRCdK&7Uer&M8qzqA|_R}IvS z7EJ3q5SN>k+6@$DThaN^v7I@8$mlLxdiMg1@8ix@;%&KlOlDJq^qG-nhPN~FE2y}# zzBA6k{dS&4XVLnxv=_2skN$oyluCjuWpon zd^IGwqBaq~o*M>HMZDD3*GmKBD$SlRN3(MmbXP@O`Q|PyT8AF`0C6ebT+g93wq%0I&!^*?`Lopc`11%-Odu4bZ$NF%LgXpjaF%Ak5K>YUV3r> zYP<6^enbu_xOnSq^;@#W-^~5~SSt1-;>VkwE@SP6PN_ih ziQTr_Sz2^U)Zq^9F2(C^tOOaa%;X{xCyPY@N4ZEt(XyL0>tW|DLi^HQurjma1=$0i zas7wXVkgoqI|Y^~7-&fxOHP3wFJ;<&S5*ZPc%72t$IQel2W4kuzw_fYn;uZm-I0>& zk({7P@lZ*yB=AIe5PjtV1ur|GDqecVYi)jxt=(gZt83|*q4)a`n%l#UWsvw09YeY( zx&C-a9UcGdU5rKR#@X(Ipx5o!O<3D5CCww!tG(jQ(CdC<12I#mI|PE%WQSz2gqA$> zQ0>0k5`v8iPSmr#9VRGD+vjg8$?PMG6=#W|KETPR{wi$KK`BQ;wefhccpFlbtQvdM zT54=p@uMLp3<`XLRM*RvZy(I$tAN5=g<;Uo!#8IkSxIjvNW{#&?lv*!wGSCHMI@fE zR>)Jspt{~0YQ4MqyR;wRsJ=iX#xXNzJ-LSr1O4hNPq8>dzw;kI^}gJ9-)~j55NUK(kWSSNMcWM5L%{OHV+EiqAyY5li)egH;GL(l`<#~P`^~5$PHRvOH zCkJwcPn*Al^muVS=x?nU)ouAEmdE=-`13*2pEnd=Z{Lpzua8YA{IV(#HSqlXE3wfc z!V^EEo64stHMNT~lZYJ*JAkafwoqf@F|sw0Gm|PUh`E_LlBra2 zXJL4`yi$$=MI9isN6P2=o?HEM^r$u!VS|0W2}chnzfdV3(K-6e?=@fT|Q=W2L|_t?idu8*J}2R^OdeW=Lh3KFOOEj zxZM>Vqdt%9*RI>&W)gJVTS%*MJw7k_K^1^W#HT@?6x&gxSf@{^y?a;YM4i+R*nw%?d?|aP=p3EB zq$FxpQeV3xo-hvO)SZymUZF72E7O=+yD@if{}o${Rw;Hvdx!OGb%x!-Ze$<6$wvIg z>c&#FoT7cLd*Rv5VVK%`IL$12u8Jo)~? 
z)g{AXylC2&5H(&BZim?%JI|`!`34(4o&MnqheWc% zS=qd&D6VY|N7r%cuf2L%_-m21(Ee&u5_eflRqQ+xrV!KXC*?B=P3ShpGS#RtU}#w3 zWiS10NBJ2JarC`6h3emlWw|ugpG;WZaEO#}K9Y3%+L>GI{OEx8a0po86%c=p6_e)* za+@~~Wm;RbS9rVVw!~%0!Dr!V&=55)kFxnI{N~TT%@V6Bcu_6Wy3sE{7E+DIh&LD& zCFJv5mS__kt)ewVdiot~E!hGqxVRr4ufLI5lXz^ZE@(XXQK#IZ*X+<8>zVG$Vfns5z99FuPTEbzynN*Qz3Dh@A1o7)@g5cXIFXc_O}Io z)mSQa`5#N|e}PY~7rb*%ky8HfFi){Ca^Z!r$CZTn=n%1Ua28a+O+n)B@4^=D^p)$#oVFAKm*6-Gm3?(ow+|#)?b!i zt2MP0Mp{A^Q~1!`9=i-3!*%D!X&xEARngE$q)sp|i+S=PLQM3j$_`DCx>!t9s$(hL z30)ZJ-gssjwakK5y$OjJW6I-=vX?tu)vhD_YR;1o?UZ%{6NLb;zm4K;-A7ryM*b2A z#25m9wy0Eb{l>!x*-d-Xr$(~1;wARM9CcT#TT_!3)-UCeH`%XR5oyY;=7Zj0n@k1_ zn+73B1*G2aNT^5mM$rvy@fN+=osU6(I5Y)D$>t~4emA0(mAq5ME3>znRVdwm=wNd^ zdQd+2!16~%*br7Z0uF{}B1l=HjC zb=H8G0H&1TrbutEZQf*h^bLN5DWxno{50`*>tVjTyz8fW&_`yJ`-?+I#sfIx3OIFp zf#rSzr+Y>$*7KcWr3?HL5wwf3NohoAJ`>oL%&8u0a{>+-d+jQG)@cjF4Qm#muQsqV z91EvjYrB`Ci}@)@B?jY_V5N^#HM)zBe3nj#En12V@;cnkS-9XlCgT+D@NRe{RA))y zA=9hP6gVf~NX9pK)M(+AnL0vsJ4&D?66X1OcUI31-6Z~ZjX%={(|$V5q;gS($C!~k zj(it`*ds3Zy_S_=i%@te>F*q7ukXk`&U=!GxroQDQRIKJ6^C72=Fe1qYz^(RuQmvU z_vm?FMgQQEzt4+qz0PSar_DW_Y~#hdE*LL9_ZoG!sGFJXJXbAhb7R zaq5VUO-sl)FdExy)Nlg@ShzT}{VE~?wuS5cr@DB?aTERsP*%~4EYAAB)nCQyQD@2^ z;>bU~3ZtmS+%-fRes;qg!_gVf&D4jZ6IbQ7W7JBv9SHQ6yN{nb;R zj#NL18P%n`e4FOdN50MV$S6v|sp!|8*dblLNiWS+?w(qt`%c>|jZ0m6%j5|c^C6d; z!-v>gYL+|=sI*a+=C3enwyh)=j+Rl@to|5nv@tS?3)n(lJYVmakX)FY8e#uJcal); zPcqnl0ZN^xzj!L2GhpT13(rn8-Y`_q>(3Od|ezO&pcq-g8z@8dMt(D+dBE@}IBBuy2=}PU#qwbC{3WXnv zCJ&%)ga{Q;^MX@MPA-#p_OJ`?K0pma~Q^$$9ZhMRGo zIO|;*FChIkjIDqQMG4Xf4OwhQG$U8!Ui9tX8%)x{?Q+A01@W}*1xy}J>&<7#6%%qvdR@HEK=qw>(>`+mYX97nbz$fcBEbFja4(M}#qi2VdvH)XRJ1s=jJ_uH%~N#Q&Jy z&**e}%p%Xx-cq^BDCrXs=5}`@yodT!Q^Wi;3rU?!%C~&mv$Ni!I~KL<_hs*-HK!eo zNaBx-Y1kcRC$`Q38g!(XbUXXwA>?PZ`!W=R_+)o@qz>{*NE3VHw_gyR1%8nhK9=(V zNfn~d4TX8NoS_=G>7VI3yq(9d;y}|T&e=-va&GXe*r`te7OG{X6YO4*87pYZywQn$ z(VVfVr|!te+pkZhDk;O32S;BB_rwhcePl_D=Pq1>TvdZW_)phSRyjGeQEr2=<1)me zfawEY{sHbtjscUnU_>T+u@QAr>T56Q?fefytqMWi5lhOg)I^807mg{kiN$4~kr{UT 
zzX`3SfKG2G}zq1Nu(?jL$iit`n1C>JMWi{IXS&*%en+sd+Or(x^gF)k*Zp5&eK zWke}#oq`PS&&BIcS}Xet`-pin*hz13DMo~1Ft&J7>v2tLC8p8735#4g4zDE5*mwR& z@{`H>E@yK1V5e7%N$I8g;TK^E&Sz$midpO>YL4908~veu)Fhn4Z-X%@giD*_UREC! zM_?sscvXt)hzv1w$tD=A$;3#%(miV7QV@l|5j)xt)6dgu6TX}Gdp6a~-d&WD!#!^& z3Bn4@q!+F$s}=iS_Q8FJ)9z!LiQIiUKZaPkk5=}Ize`857p8O5CcKU;xBk40D_l-5 z)|HplL2A2-C3l1*=cOt={Vh;fggu0!w2ZI((~+EtksA8S@7-26%$=D(6>F~h4)Im3 z(nFJIQqER0r4D7ErE$ybxlz_*tWQ=Q@~PknKriCVcQSC)he(-VQeE=09Pq_$$UdAks(9wI71b7ABQVPS1igpBGQiyk3kCnGwca<$G z$6iLyHpG(5BuwUmd*#$cM~lMNpCD3($02D+OLll2=wHZ%%3J_Rm2?AYV^ zjy`0dG=boG+StUfI>L$<$UR#oB#YAVs%x=Mz1GvyCl_|ZK4RqoHwb*bL_Q`fHagWF zSeLld1`P--VWI+$21P|KlcO8F)7dIB8WU}YzkZ<`NdDd6C1YMZ#DfMp>ibWfsw)(_ zpJqG^qZmrY{1(tK!K{`|U$7D*?m3hc&t*(}Mb3V{9j&lx@ukq5OZb!4!I$h#HtojE zCW3M&YR|=wSldPq5(psKK1czs|Su>C>`KyZdg@Wbh|dS<}~pK zNa8MOUZ6n@>RW_`29E)KjgzejXR!p+qxK7Vqu}ftacjmMWigp{uiIxyZO`&H~i!d};d0|^s9i$yk% zKnIO~kjCMkr2uTjp0f7xk2^0tdnFw~@gDH_lSn2$R_#S%ZwaS-xKy(va-C2rS6%+KQMIMydbdeL57{MME;PgSI&H%+m znOdc&$r%-wCT^&lBfdF%0T33i)4$c}H|6cR$mA%p%?)L=Lv|0!A7>CVi*&hu6NgAV zEmd#W{}RjVR}{mdpCs;?=nIsN@)^AA`S$jOe10?cj!jR;b6{w95y~AoSp1~i9L3E3 z3VU_cTPeWyd5dsd*}h6BYiZ&2`eGuVmU~dIsmn@<(&Nm;48CB4QtB4r0&Osii!ZA0 zjm#+li^eC;@y}H+=Vkrhn9kuCE^-;q!vzPHucTXt=@2akEhfdFF zbuoe`^ELm~-&6$__h`)`FBnS>AL+Nj|M^mHv#xHnkATrHIjM~rphW37t8tXe37VFkZt+JIK^ zF`?Oxvi~0=4_VrN_;sv2&#c!$HAFg!lx<48yJ6cu(pK=uQ?MDX9^R^qC= zaEf41R{!k)J;wmr5-)&>~x1T33-LG!YB zKmX)}30r2j!e%G9^; zlAN;7gmOXp0%%(WwoP$S$EDrcu0FFs2N%d}8`~}p{x=H%;V1r=;H%P(o&X0QrNi-_ zw4TSHK{E=i*xFIM(PeI?>@H!w3IS`nL+AYWkowtncsj`nr^VQB)$oS_f3Z!P<_-lt zuNS9JARj9z3jDzjhUddsnt9D$VF@VKs!0iI3c?JDtT*J~Cq5Wrq=&aNm$J|Bl?LX{ zDBr=y^Wu@+gjOWpy>A1~n!4mg_4U$JdrD@p(;)X^jwSoR z=Wa9=a~%N&lOYg1L{btit#Vf)^R83Pj%|ycl6!f4%UEnVQBtJ6q`CycEhBaSpi)Hx zSOrdZTyIV>KAa~DSWXnv7WAmh{&WUeBVmur^TmTSJA#`jA>{JK*;z5pii(Oqv}W*G zmm&BCYTY$9>z(tcD@^};J!H>_Ys8zIyvqG67zT*T0I4E+>SQHm=1xxh@%X&D9mAZy z6h0dbDo!sEO%fB?V&=Ebfo66tBeXtrb|jfRTHCubVot2ry`m*nWi(OSZ@jw=4s~%` 
zQnDF5$rJXx4yVWKwC1;3&z5A(M4ofXU!XsC{E|-f=Lwrk%0Z!Ko)7UcnkG;^j_Rf# zX;;qh9mp8}R3B=bzo^005`DQ`d9wtw5DF#iIW$u@~l#ri0cJRQ*kMmhFEUIPht3UaICzyqIxB6BI z*O!M?xt&Z4a-C%*kG*(68f5BQUQ^A-{r%+g^P+7q!UilO0+sYdc|e86lJM*yJ31nk zY_@llAC>k(44EjD7*NAOzyH@TeWF=Df2I6su}=fM0GQy0GFqf){I|HIoWuXqvuBL9 zXU!f*F>Hwy&bym^TWs1G(1`nm-b8!>7d8@C2^tFWjb=?GEBEd6KA=&_t> zmhN5)%TZ})=Y3eQL1SX5V+lhZV(9e$FbN8)w;NJ>kS5NN!iBc7a);CX;8 zLf5o#=y84CywxN<H+ z?yaHj+k!`>W29CLW`^V!F3mp#dCUJSGJWx0)8mGz{?JEZlKUH9{y7RPtehsVXO_J2 z7%;zqCHo3KHj{<6FAiK*o+5=lo)2!lEbvFI#F2ve%nZ2tume8L_=;uBCvV^ouzl)W zT|s9w3AfGUiutRmy}3Ylypxq40x2n}R`<{4Qy*4P@hU3Vg+&W)Rn)jI9C`>SDA+c} z3I!81)g2V_$bWJIqeqJ~S#Qj_pns{;L-7pI2$x}TIzI#CT=Ltwx6Y)ACt~kmG9|yy z*w$Go&l)e!M{qp=?BRFl@Bv;HO_Gz#u?(hI3L%$kp4FYRy#>3KUPa1A(+fF6iCE?& zZVCJtA|GOuUtE?OZo6DtorQA=4>1V4V8qA5N`easFXKrdBog3mYFLLd7pgN52pQt@ zJ_wnX$niZSS9t6F*<Uvp+n&<+2q>AEQY<$NfMP8qw80U=k<|fIjH|MrJKYs-?V=#t+FzMadIfsc+a^yCw@1xoAS$z$~5J?N;5@6{oH^aZEx987~wfo^h>;7fGL|&~p)lP5jA0&ZFeGVc&tP&o41h_{ zm-mqU$Kwr(KhJodI3P+vT?M5bd^5|!_>Q@u782?w@F11Yhb3^KnSU`xhJ80134sBv zeCd{`AtV&fomJw3KhX#J8C!%N?644h1F;pYr(7rqShC4*cQJ%ltVTVj4&CNxW>D4H zmqAd+Y4khTv%XVcWxe-CK+m>av=ii3Ud3eQ^8>=#`ac8;dvtJiw(Dkf6{~R zldWt`z&h2UCLv&71=6SY&%?Wr#8sp1v*lBsoliy4>Wcl(0l(yoiW^eInaMhoh2?QJbZ1=2V^ zfqUJrL6q<>a@TwE!rfpYsrG4p1sDg*&cme#WAZ(c!G!wxu!$%2U@w3Vc-S`PBN5hp zv;HLiA-SiJJ?s33{)L4vxNb;R*PZDIgT0Ja5&1qU!$!AfNsR>X@gVj|KG;uws@CqDyb-(y>2si}a&YRGy zNItGzb{PDpxv7^3VdMe*>g912QA!DRdy~=kG=_u622f`V;Kz4QY!33zx~Tp+iZKFb z@=JR%UXkuWN(!qBg*(MJXqf2z>EVFop>K-Ev9l@*puH>vK6J8ORgBIZbb{fYIVBs|3!a{Xo<(Qzg%W?$6H!f z{cD`oTkg%EHylp3T27Gu&AaXW{<-iI66JrsLqXO1RAw(QNc z)$Z;b*8kSStj5p8!I4-|9A)k%v8~ni0^nKL9!4eLM$kvMPTpGIn|QTUN&@ZMXo+ug1IHYI5x^(E9$=t-Z`I32t9yB4Y>@(HV5bpy1t;c z7#c@F7!ax*Y%wLTRV`-IGsI0FF~Wc@t54p;fJ zRa@%idB;l!QkLF@X#c*ax~!Cs>e0CNV{w(|IhmR#_|kofWjQ8zu&-dzf0)fRUfa#5 z(Qf$Nr~cxf+N8!Kn2)nmKXH$)D0mlb#**-&3<^}dgMb&n#$YXJC*||Bn#;WTPnmED z1)u)$;WYabF7yBc<1oiaeuxpe#$3w9`5beGyvTeSYlR_VweoUIEE z(x?0|V326aXMcIqhimp)u@KZp$m~*oWGf|BaqY$z+H?9n}DFh 
zM`IIC^A6FK>ZYv)_&3Vs;QwTYs-v@yzeH8?aeb$pxO(*VNL$iBJK|%2wBGYp$YV9} zQN)Ko?x--1Gtv~B_T(NBNP~ocMvjoC+dnu5<6R0A_br$Qk;4Kk?_3lUbMr957vHS= z!k^f+Q0;p>mw~Fa>{Fkotwsk08!O|jH5M*HNbE2|P~#1b4->NwM#-cI*)ca@uD-$o z_LPz4xn({K`p z){%j{0piNNrku8(s(*2if~Zc@;M=8nD@%d#T(vh2@c}U~RWS5n_X{AgsdAsE3meq? zlRK+Zjsd(+If6WPn46Q;NQq-be4NsQ|3J9`Hc*{TYBsK#M=)Iy$06Kem*aIJSGu`< z(@?fr@Sh#`1)+{{=73gDmBFm+OUfQM=;1zP_~#~-un#pZ_Ntlk4IN4an*V7U5`5zb zT})B@K-DL!i_-lLD1;fq>CB5&-run3+~))Ni)<~4p=SWe$)ll@2rdXO5>wHuWut@y z!4wSo?i%mp!ub~iL+<2i%-5SIkbe@9f!3QWz`njIGXC}1s{h9xSV(?v!c?Fp?7U56 z0;WL_8bryjU+ErG^za%Ue`@RSxc+CwzR~X=yQmD@Mx7%c?@Xhi^&e2eAXYkq4ycA< zIAB@0_8;j_7KmQ;?}Vx%gdIUw_lG{6^S;B$#@|q+HD|2fShA|{)4Vw$WoQd^$v3oK z=lg*$A%s8z5!5Rwk{g{l7vP972MQyS?w^CPZ)h)g!N6vJrWE7tPfsa?Js0JnqKqZ` zv#Gh&ZQN3$pV!;!f~#yPCX5a$&oF+s@I7 zsABlYev9vREYC#AEjp-@3iX=Pfh^GA<+n2a5M`B4s$M z4?`NQN^C(BYhb7+r-i|SHNa`OFo^c;*>l&~)Sg!aDq~b)v8cz^!f-up4| zcFL9PCAWJo!k9Im8Vb)*_^%sLj^zI^->c1Y_@Adr0udk1|a<>2$nyE16 z^q@&B8Mc8U3m(nuV>YYJmmQ=|#`oYleU8wZ#Jg ztyFQeI2c1~Ec2apLVT=t|BOPo__62RZBiQwH3CA^uU`Y<5m8Oxa+v%crA4$Xq_#7P zjtC_TkA{`_#HC3K&hgXC>#v?5IT}l;oL~s0(C&VopyKul7vRxkHv!d8{mD%77T2T57k` zZRU5BPq4n5n0UX}@*<9pk3W7?X!7HkI}$6|=iT|WzI)-Jr1RPJpv|9KY$KR2ldtle zb3M8jamXSC5?7gVmW&O)fAHZ7I^pon`oYbAq{mM6KzH1mYax(E+dAG0RToO8o-0W% zB(H66YVvOT4<0kJg8V(>Ey52Ms6^n>g|n#hD8=bK;*SXhd3rm#ejBsF?`)RMkt`m_ zQvY$Kx~Wbi;SwXcoj|?2^!KJqPkFUy=6M;+JU|w;5mzM>hXCwh(_LDP_HApsxc{Hd zw>mfND>$^6OQ7Ga%Ka#IWl-8#JXttcMcijk7~AprVt{OVoyJbtBS~l0ioM#u0^b)V zKiz&9v2NIOw4yRuUb-*EyuBi{w=orH*H7)1E$2HXYV8j&eg(CVqC~}H^P#Y!^Sg`=Vec3vBZ4R^G zAH204cef_kZ1W$9+~KE2uMv-%HtFYURZUc+8DCdfyFY_zSSdCkpVgmzcC_D8eFtocBx=ZS#X75PYg|6cot@_vX$>_dBj2yb_g0ljsuNmicrAiX9DJ z^Lf_`rLV(J;e2`-_b(mn{b&WxoG z*KyK=i7q9l&-!|mrt2?K;Rq8oQ;@Y9>G&HXgWRcI9YJ&Zhm;66w0As*rY>Le0#{p~s9x9Oh$F_T93;~#bI==Ya8!b0cX4X74J zuw{Ky?CR?oe>C2>%z5v~pc%7nJ73FucWH9dh3cuJK{^fw3V*0G2}ilUz8JB4o5H;n zvrlQ?Q)RlrFTeD5OZ$)rOM-2mZ@ArpDOI%tWd(H9uTpFf|S|8M!rleau%iR8jAow0*y`V&O@SpX^AiFi{mKKc6dTB76^{yljp 
ztpRz$tDbkmye+ZMOP9mq%Jv2?r~u)A7ck8PRjckSMOvb~rej{TmDOt>pa7T%d$M}% zljhFjSijfjd1RZT3ngaqT7rWAqmQD_?r$PN0jIS*XOf#ts8^Op~JyoX0|7u`5pwzv544HTYToA(NT ziV|?z^nAr%lnQR=ZaNnWhPGvgLI2M(i2gWm{a#-o+G*L zpCZJK7tm}JWrlah7e2C@(ChsJ&Xh(F1(>h0f!y>W4)lZoxHtqvP{J5-91~Vqy~mp^ zVW98}f!Lc7D0c;opo2ObAoA-^chQ#kHHlPQa4VT{iHf8!J6X^u0JM3EmCyCr%ev6S)IGLF1 zL!#kd-oT7(c$N1nts71ADt>Ujqx`m(Xpk5`IJW=q{VKz?yem7lhwH~SD}s??dy}7A zdb?@AdiMBEWU;j)3>1Xu)9k;YX9ky`!i*9~kN_XftxZr4%bH>%9lue{GH;>b6Qz_7h`<@gOsY(#cwR}nwm58N2l~Dx zg<&J_7Lc&C9-GYom3PbLFIU|1U3&Nd=T`1l`CguKT^!Oge!^?1O4G^by1$>3_OXXf zHlq0tXiiXNxW+=2c-vG~=jj(g<9{nMXoVeY+$(drxc7}HZ$awrE zP$@Cu4UtF@BI$H$``!7yvcqC&yX9WL;owD3nd615A((&s8he!bw?@Y<`sZ{+DiIyP zSIsDk^(WpD$2K@$yj&pAf3vjHXmz0htjZnhemsfN&*9_Zwd=-&pNKHB%YFmCjb5Di z1P0Sx;5|G%#@qK68XH4A!I?DuyVQ^MpRaiLazLP^J(p0$4g#%>Q4jm|U=$%Ts38K0 z9ZyhkV|M+sCd5^v@nh^jxXgmi&0j^R=KUvOf02mC>|PYRR5?t&t#W5C8ESoZazZ*p z_wqX9Z^9>H;8lkn4YDlMA;}9ker)id8?vJ=|9NpKlIL1|f~6Vuc1XuF9$xsZFpKqXW*<*XwRrc=b z$WnEi5`$eqF88VRyk=jenK;2SvU`o9TAZ@p|7mJh!Bug-38vet{TkOUF%wGp0)=)E zagZ3F+v7|f`OVO})Yx|9vK|xGUYG}>ZPADiy_ifNBCfH5zM4T`*)Je_E{{Ntx6a@L zD^w7N9xeC8hFRM=Q>Fq z1wB(gj+o_5JgO=yl21GRV|leX*20paq16t~gNN9M@=cn7^bIV-4`7q~>(Z=T^Of6J zyi2)dRL-W)_MBbLsN|lZ5d0_|9tzXPx~Am~rXEErhx18Qp@_c#Tw1C&2sK6of+I2* zqDsYuU(>E~zhYEZ#vX03>FSLsAW0P3qI%!)6uNaSosFPs>3;4TX}6<5+_UJ}L!zY^ z891th)-S)8r~%C^7$FR>7s0_-Luoc^@&Q_3319N)`*-Tg-{&09G%N&VQ%~1t-wiiA^C1%C09ni|qMppltyu zgk*d{e<_kXv;6|_=36vG8FZ^LdOsFC&Wn8crK8{8R^36mtDN6tI;4|v*N02=L>eh{y%FTRQ zxG>M##j>y?!0UFl^16(w z%Roiz;O9Qqt8*>f#1_D*a6Tm8H{|voDb&-FXV~|S5TMchnf3Hsk(>1^ei;2A5D~;g z65f+}yL8hxbg$gNxeT|?`>Mj-!dnnSNdJL?p+p4G*GbM^3?8w7j4PIZxnLDM95X03<0a;!rR}$pJsNa98q97{jAo4^jNdb9OE* z9`ipfId&l!IDK$G+7l0JM}N6cT#D8BRLI2#i35?#Kmkdwk9HL1niVH8WL^2rQo*6l zma~TPg}!prpP`?CW!UuJq-Agt3`+kWTUP;9WxK3p6H+4G3eq5<(!EinyF)_h?vUPM zfOL0vcS|TrcXvrkNjLZ1=zq^S_uRGq#lrOid++y~cjlRAo|#dj6Z@v!#X)T zVvud2f+~jyeJ&uX${*vYJ27px?(7lJt8=bVt;+^ETBFlNSS{a&F%ZX|;ZIbZh!&xM za-sVGUEI4jpc_P|!S+XwowCDf2jgGk#a3X1#XEDZpDh@gB=1gC4Sp@_I+aYm|8Pz< 
zk1GTCJ0B|K;0MA3O%B7?zV)w~acFX5tk1@nHJ(~rR)RSgXmX1RtB^}dfBj3*Ri`xQ zZMgCAv4ltF>g3zA_Nhhg>Fw3Ln=Iho19u#{U{A`~Zbv*Fb+E)H93!H#O<3!o|R zx*|>CKvV+m!HHmx&$TP>WwW^xpOvijJA|Uop2rKFO|x!!7AJ#j@E6&nhw^JC@vaY7 zIGkka9Ngy_Hi6~<@3b}3$S8DeF@z}raQKL7%i1!|$=tS9jfEr7z+`%6Y74o|uYb~R z%H-IWT)Zm@NCDGH-m?+v9VKMFhy&B5o!Aa-VGp17!3TFY$13HgLPmcbN0%%u4ZU8s zqqxa534;J%QUJBwx&Sqh=)8$?&zLYUUR{2w_fg~B({16wROiOboKWyn@LS6Gq_DZY z4E(9l!idVZ4ab(&Q|Zn(Uv>JEg`*Ccl~lwR><2y&v85Eqxewr$VSKsSv;G;a8r*Av zNiue@S=BAsz+O9EqAhgSeP*AwDoncTrwQ7H1Ztc7vJwS4yix`y8|5f5taONcd>T%gBC-m#RB*C_rq0QzcI9*yhwai@M3WTx7Oii=bNf@ z$SJw>G1G&j#BFVE>+$-sB&#|@c1j$0@eI~*lDswH=`r1xA)nkRQ_T;iXkgmzYBy;0 z8un9xAAti@y@efLcMNqF-mRod7QMd2x_G`S^QDRM=Vwy+^F(k-T}N!oPx|{<6i0lH zhW!CVox-{bWnYd!k4NoFH?hsf67%tjgSr6k#{G2pZ!&&n;s|yk>TM{Zy4l^ zZRZ5X=gwUbsPCG}m<{G<)$GzYM_Wl2P;nHFOl7@(RB^eqNu&h?fD&vA?&tf}I^s2! z!uwYjCx#{VR%7O=?s^pvxXArDcPwnh_tLJKWqg&Ehv0=;-BTWv~A- z$Uv(SYaGhTY6zSV{v$Z*SCC5!=Cl~=Nft~a|q zvgk1d?(?gb>5g`&D~9ibGCr@>6vx>p?PmCNm4(Yn7l9;E{)Eb0qjf@VLq>Gae6>q^ zSCGY-5kB&4p>l)0a{H8Aka@g86U+zY6E=2H3+NjA6r6m_w|>I$!Rc|XN$?Tbk@GJ_ zFjI5z8227LJG=u!f?2t{6i-zi4-U7ZN&06nRda4maaWeiwr~h%N7H62&JEVz>o)FB z_&%N@>@C8zk}9$I240JkV8V7>NiUhVOHmeAo+R|OJ_F#AAp~4^4|};f=L5Il;35rL z=@`}`_4^kRPwv+t#Tru*so?`vub~=<2OWha0_XP-2;hV-Z4-}4U*cj-_(YkTLsgfv zl;!<=Imworp$|v=Tq%1;l8lI1M9liHn&k*CrlKrVsVx{EZvSt+p%H!F)HyGm{ z{jY?a&XqcfKIK)GzTvf~seS<3mBXbT-in{^AUs7s)x(-7^*pfC+f^a8d1I+HS{$tt zMOxVrR~9D8#A^pK8&EEoYQF9ol_p|Kx?k&heC@kTln}`=y?Vl;=PRk znXkZOxg+e--jw+P0-8+hlh;6YkBbfp<~3I(u$P?3)w|PGuP?GAow-6QC)?@`XQ!&G zEY$L?)pfPCr8)WEsew?$h$is%JlvRQpd-9L&j%=Vl)J=0w}`fi&Yj^$Gwc(mXsTc8%ir%mKIZL#l{9!1zA}vZPxj+2_fWQUTV6F)a=t|&+<%l zKImqD=2HEvsgV_VZzE22;YWZ8Or3S;82K8(WbVt`D5(>(!OyH#cHJ?bLp|iJCL67m zBkc;YC@3h{*|(Tl1(tCEete~&$N=DsCI~N-6;1o=+(;@K`k*uuK}Xv&(rmlap=_u>>-W7&gV? 
zKqIt*Z(<)$xuf`q`<_G5AiHy(wGO+#dl&AG+srkRI|g{=O!~DQkx$i_YY($7!g4s+ zME>Z$v6EVg++C_cSG2*+&gm(K^%;5`5ukZ%~8cuBH>+n3-Pp6U^zM47LDGGkVDw-Opls?T(UZw4zbbv*ecs3lmnEbGws_Yya zi+(`0fb6_8!)yKO8_}8x);S<4dQ6~&N{~t)hEGC|l{(|2Y(J+4+8fwBS#<%59d!)Ui|(`OpfgsdFp@3}We(rn>W z4d_acJ3X_Ut)Pyw8q&Pm9a#*WHjXZ3veNt_I!F~gH5KdBVu_D4pmg}4fg>iH4iWK1 zbvG=Ema8VFO~!LyRP2MWo8t`-u4se7PMCG)`$ndlo&Y6s^??NSGvPOrgOR*tcjx8h zGVS9_&B#YCJ2PMf=zh`tteai$_TUF(9H!V2dq+oAH8ua}hyK$^mRir&ThHP?v}$(! zot~VWOvNPjpc&bGG@k$m=h0U{Si7EWz4TGEgMr?pE#CDNr^AKOgrQ=bSZ*7z0tREq z`7W}|J+m&D*PK9oeccAx8!MJx%gOS_pkU5{c40mO@5w=d-)LXlZ$#+T`#_G|?s;~8 zru<4~>!N0GmnviK{fB0^-&9M35k!kPc}35@xx_YIS=}}8Bh7C1|55_7jKF>bnZ?+SnruNj)NfM}YGBuQBZ$Ozf_Bh*Dm6lfi{XirL zTfqi4UXo#^5M#|QW=T3(}v~6zCQN7y;Jq3E%S?k42^957h(A3N16=d79JGo(3_mbQVeXKejNlFd08R zGNSU_8XfvRqRhyW)g|`guCE~8YK^p0@6B~iM#OZ!E2DIG_+f+XYYfJc=Uh<)fc+EW z4%sPT!~3};bF=q&u})lDmBn~hOgoD8A2llM5G1i8=U+lW zR-)i@rwDDl&cQ?tXZDhWm>8(E_6gSkr&3c>lNk*;lSK^9YD|v%lJI^%3m=ZOUhifA z#rB-1tzMUOC`dxE!+iY1%6vVqzy_l!)$d(-RBp5IPz<0u;i+f0%hn0YydVit&=LT3 z1lXzCO(@4ld*R)`;2nX(pq$+^UZO`! 
zKQrZ+C}8jLEG{%N9O88iHo1}KOa}rrrD{3M6#rJpxgr&1rmr`jCqiwhwHvT>!fWWB9pZa#Br{2QL0+N`oZt1}=_P_TYdHdoKM}!o{*K7+fMq;nKTg)5IlH z%2x`e)v8K#3*NjXGpa&3`or68qFy$xka_G5A~rkZY()3pD7BnWoDO92vEgVzV1cLu zEP7S*H6Bs6-7z@Ab$WQN+vo)umXq(Nl9-5~Lwvv_HQ1 zy<0OZE&B9kLJ(fD3ZyY(& z>>sRUh|C;7n2{n}<1zm3U9n!8cGJT2qp;c)=`jp{Y=x%?06-W?MKax7M3+1R8qlp# z9FDckUW>{ta%p_bgi=Q&nU=P;kFxBwPAA9HvFl&z-L{w^il{0Dmg2%ABkDG~!xNNO zk5i`uLmO&@ysw!*A7_NY>nebEF(c3>)l4UN%%s|qP7rt5k8t~zLeuYnQ7x{V>*&^J zN}1o#P;=9LmZL+&qD_CG&+O+dJ^o&`v0o)RjA=UYp5I?V7xIq27pZ%;t3WOCwslYz zo_osbX*gm)-VhFqjjbk75D~RgP(|0=@zj$$m?gRR&=LR?svQkVk!XVF;&dLE9x$-{fe;!NDyF6D3~;Pq@P^(}bV$W>R6rSA)~^h6KY?rT zO>_son5KRE@fp9>{+rZWmS<`kAG(RoFdnNFc2H2sF+oKis}~z?Agzz2;($I=R=5fk zVCA8Z%t?;y!-c_C@&05149{aK0cK`_t)qnTU7;^T>N%hlD>n!_R;YPze-pG5_1GlN z7eYl2-Wh|jpDenS#IvrsvJ`_je-a^|JG1+o#q*(FG)`CbEAE$jgTvtMv$eLauN_T~ z@6G`-4}lFi-v{~(qFG_cIWW%MV9_q+drE2`Zh$SW&mX~eV^cWQeL&Q9Z$S$sD*J^9o^qK|=cerilmq%Nl`wHN` zLG6GpTz$GkO$9lf!fzx0qHhBHgrQUcwul3GMVg~7k7N@ zk$`^GY%m(Rc15}G@uOj_eg{NqV`Jmh)zxP#y7w>&Ue!IDe@v?R4Sv5y%3dy}A5yP> z8$T^61DNo1Z_#09t>pJ_j&l4HS~+QD<n{_4V}b$+^Jtcl4<;BhOjuX?Ay8c9dv$R7+2 z9ika_hQH(vVZF-SeYOmz!{E6hs2X91+_66UexQ|>1)iWen#bws*Bkpg7I^n!ONm9K zAeEBFQB10=x|bDd^OHUYj4Pt`Zm5Ri9deHshC{M92VPp6B8YSVPGVIUXz#6 zgXXvVhFgK17})u)BT!l)OOFC&K^^;%mUQ#ddXxo~aRnh}+PMc;f*l{gTWL zd)c%jaUd)ZvFPutB7|!|B&cnVzwru@iAB5Kt7`);0Bpf#9N$#nfxr8I6YTgYL4j6w zI7dEx>tigLPL;W8)0x*=zkdAR3g^E*Eu0htm2^tz3n~xp0@Z>xa0?}UQYSck*VtZw zDLhO%=_*ojVc@kv!DM{Ei`xd7zNkP>)}2L`rG)zKL|)7yn@)Xog5Ya6G83Zc+ops+ zf8CG^VXERW71O)u<%=u+a4&IDv5&*Mxqj=hv8-17vZPO0W$~tHq?DB-jw9sRa|)k1 znvI(c?NyS|6>KVCfqbVP-qjd@fcEf*(`^7!bm;%{+N${7LzkSrVA0EyS zI3bxzytk3*jpqcv%7}(NGE?U;IT>Mc1)9*zz@#)Kq;pKN*r72hBk9PU@kW=lcdG~L z%uCa&yswul225J-V*V|Y{rQVWc&R#DtVp_neu1bVpADPMSiiQb5WUDeLs3$SB_;)Y zYcxkibar-GT3K0GnSUD26&DvrdV84sMT%^^@_A(3&+JNzNwM?G4j@A9&;|pe(g$(l zU1)Ym6`MpCW*b}rpR23K{1voNBEs1e1ddmLeFM9!z&ma}i?I2@kc5}3T0od!Yzsmo zXgEcWi*s1R`9}uz*Vg~MT}~pNMVIiS(2r7;RbaxyI~AnD!opKz<}X)rSeyRH%t{EY 
zjNnh}FGrSrk|USQ!|G&C1-u_VlPQ;J;97r*ux1~vh57)s9Y%J~*Ni7UVDahS%d|<4 zea4hO-J-j(O$2i071}7KiC8f2g9MpQOssj0fOzmbV!R5J>;bA&X(dDQq8ZAEUX0!( zo`_A{NPkdg0NPOw7Y07AkDPZ3)SsXuBV9%qzxy<^@s>g*d1Ut zftBXb)rE_s$!UVH(|WSRxk8QgY!DfDdE3L2qn(v8-+y{P{^RijUX=yv`=~OX_80ZN zC~(W;A>XT5~f__yK4srBJ>bP3OF_6rS%Bj~^)dpfgp4o0lwqg9#syafIbj<~S90D{Nf z31a^3G+Lk@0^p=N!nGv(mNBKe`K>4o1y1K+mOML0Mfi=j{_7X$rIDPvI^Nd)?F7AK zsRk(O*|X1=4VbL}mje>$TAe`fpG)wMOM>4AFeG-a{j@OFxWyuqGN=dknr73q5Pt?{B0pqJaIM}kA{ zAE*QRs0FzIYPYqK#EpEHf`+ofPs$C#qBBm#erunoa53DScW^VKMOsI2;pzqYRc@w1 zkV*K71(8}!l!^@PlGVK1xS*-7R9$U_hQNaiJVgB4k#&edD1^KOmOUe#YMr;IKm~UT z8`uvQ7uN?2BCq{rf10&NyVMUoQL^6>JOjTA3KUH?<0EJ6XAqtEYT{kqdgF-q1RN2; zC!4=kW-A@HK|~$1ng3p#RismYe^b!7!!J3hp+fXl$9sFLpsyicU977dEv|T3S#>z2 zniqY52a#p(NGp+&v(=VIwN@(*BoZXKfS|26ufZW?%Yox zjuTj^j()Szi9&jFC7wVG>ZQ?Jnl!*WfXZE#zlcL73O+4hdW9y-ArhFJ zaQoSzDB#>E5lw4B&s%82o!MpNyLYAY7j3SWb)PaVAgw!KB^X0Hb;a+~T}}CS2mjT@ z{^!Z%`nOzX+->&0>ErWd^CkIkh>*sj*Yqmq_q52$jJQR9pizqz&%ec&mO~nlrr!) z3`VYvpSs51fnMOg7k{aR&!p7FbVi2Tq|qo)zma4BjTvsb>+vY>`!B+x^duz5gZ1q2 zwR!p3diR8E!T}^-{*&GEA-PjGa0}2^xufjkMI^g%fk?1r{6q#90ZBy`On_QUcppZq z`vT!^B+H&Eef)4P#0*7g=n)yxwW1Fl z;vxA|2wP^k(~l^JXa%W`*=Y|qrv5A^i(R#j9*;fK{`CoB=T0!-DCMJ&-V>v0OQJ)g z6|hw7kCTR4A(?V#?satN4Wp$~T_c5oD~Uy|*DR^=IzG8H#yB2U7G_s(xUT!5nzEGP zm1tK1dOTt1GZU0@7wKiL!IAAEE)LUM7ZIXnLL1n;0 zqxlg#QGZmVzL{d+DzU^|`@7wlr2rNy)z8BC-<>Qb+^wcKLHs*fAln)_3%Kq}dbK8> zWM!e_PkTFx;E%2cQ3Q^Sg+dh;~8d#?#r?sM;il6 z;NVPU*s;Bko4toeAd+xRYP7QE*C6e$91T9lthG1mBAZsV#E$;J09Z4VJ1D+# zAL^m&fa2jIF)}{Da=bax7fetO-QC@ml{$-x@)BJBXt?|Cll;T1Ih1&<1%({1R=%U6 zZZ&;g3zfX7!mwe#P6Nae1~Gs#@qY$t0jPY4hHAb4?b|ot13lk8>yjm_0fnor78qjqJ_N!x z_{IxBoQ_;qe8m3>6~G`>qya5EK7G7nXvL)0^d;z?*z;KNb4a}L6EGBgeciOv2+TNb z;qW~~qvA%o7D>&JZUEFmWo-#a$O?(Q;=$0sJS(FX*0di!MDi)({4~rOb+NQc3s1ei z4+9u_?!Na&+Xz2G>%Ij&f$0AZ5GPOV9LvrN88fa4@sT^^_H%+;Lui#f12Vg z@L+-ikD<;g&+-kXC?31M5zd71oH?3+Z!gKW$bb_>H|nD8)yzs~p&gp>%^b$G>{5+o zx1-ZZK4i)SuLZi@)AqP01pSCi{)>C5B*^n-=XV7?b5>=P$}lF)ncz7lv&ukVHK%@z za;aw>WhIAPfA;()E 
zW1`5%s$9Kuyc6Ro0xooP?jU8t708TfaOzYc7dn`ZhQX}(13+$d)1%x#yj+z$r5~ROeBWJLcCR}vxA=ge zjs&H(Ur_&_1N8CnLvHfCX|lU89)kgXd$C(TJFG0#>DQQLJVI2dSGzF)Ua>UA?^>EM z_OwY9%#1RpdwoWy%l#c5u^)h;#b$Vj(XQ*?ariRhP{1OXejQv-dZWRJM{4|l@n zss~k&7}?TRj=`|1&Q;t2e8}gZpg>bGk!WVkOlHxMNmD{tn>Xq2dE8Yk4K4f}Uc^Y< zYD2S)24mfU!Z&Y!f%Kj>CXkFTU%b6|NG~`M*;D1U}U|T zHCDrwgLtZ`I;Ma7NdL{nfp0^zTI!3)=GsIna6zRP=yXZ&o^Wzz?T$2l`9c#jyp%`p z;ko`OAu#G=0dNN1IHgs^l{T$?AW?YaVly1+JG7B#3{{*kwAdr=AHFuxlzwOP ziDP%UUVn27OhhdE7nbXgT(}6?#Q`BVtP1cT1tTEdIm@1xI!s*Tg2wUAIpmEg;mRkQ^$4g;%#aYG#f&o>rot&*7%OkC&^M(ROyQM!PaS zISEI;pdhf!ft;9__)x&Lr96p$OWNJ=pH{PfeDpsE;vN#yXo;BorUYL&xKZBT-dY$h z?wx1Nu)cr)zS43sybAWvajwFP|F-GOPf>sC(4WkiX6uS-;FWsS3Zv8pZsqJY5-E;Vy!-v@}Q_1~bGr2WX$3q$%gf8MFr*32M~Y9UVT6_}c>e zf3XRrGc1a19E=$qAtJ^6N{$%!B|A$x#@W`E$8PC;W~H<`Vn`C-U~aLm47&Iu@CKET z=t9VC0N7pH%JbNcSd;T6Jtrqy+^72^qPzpyAlT#gHeO%|S)UNTFAkuL?DJ5%956-n z@I}PEa^qepJTy}WM1vRSXPg-=%l%&<)Oarp;_X;u&CFBiDffo#gl(pRwvAjhz%so| zV8s{29irKxac9A*-!zk+F_OSL6hPZU7;Td+spissoX296&z^<%K20Uvm}LJk5&)tj z;NZ&O4Z*(y0*XbTGZ0$oXu}77pFe*N4jvmGHef`>XI|Lc1eMHszz0ff&X&8o1I4Hz zr&RhvQ!3w|$p4?ngA)6coT$cfRw_D{MZ20+ryf+CZWy4kz-JM>9EmmF3(2e{2Q$ak z=h83)ueeMr=~;HVNh{utkC~+Pugb?+nTCe^FCH^AhJi8eH2k=c%+_FN4KyB~*M3=I zmmHCaT|g<{3?J<3^0u0COSu`S}5=d<}W$WO-)Nk3WW@f&|Z_d*LZ&udagQUC1UiW$mc@>E7!s zsneO_p79#(8u-Nf1;tl7i0#5Nc&ON4-x0dF+(VdBDnw$KTKa*8bfW)tHfc#I_2oMP z2DQ{$LeOP}bh7(?r7>xTF5v@B!fdZMMY3n#Odb2l-26~@PhZ@@unz*_2xfL)0KTLNTeCa2F}5>nBjFH|jl(DL6oz_X z%0DqZDcuJ}i8-7SqkbwMhH-zbCCqo)`O_Hdz*z0DbF@w-iK&Pm4iqY{4@SsOMN@cR zDmF-3av{_;$ndM}%+ysSe82r{dX&)kuSBVKs1VfH7f`m|1+YaklO(oP@bYQX&GiLH zFZ60{MaGSWt63g8k*YB0uwGJ6`2KS+|8_*MmR|mv?7pNlx_D;IaxSZ>T-kRFIM_;2 z#+3Al0w(d|v%5b#0wdn? 
z);eWIzGT5DDW8daEq8Za<~k#zdL#OLv_A^4Bp{^GyBJSK>J;_%sdb$hy|rlMlF z8e3eK5)?u1k#W6hbgSA?aoL$odSal|AAF;0`uP$&5do$_WqUpJ!8ti;uGzcUviEc9 z!a|(T&6#lQ8O~CBh_Kb+_Z|{<*<^Mdezy=2U2g07tWC?@29z%LyS}sA+IJK7&P(6J^kyjL@A#b zckx(-$R+Waar zGKU5ct51^+QWNBTr;c8+*MXfQzrcP)^N$JDR$=id;bx)K7S8|NM5rSM(kCSNy5N}S z*zxL=lE?@MX?ftwuCPa!8)u^LA-5$5*f@pwF({u_wJS1B6_q?GNtlpoAd1@7*E zVaG`1caamEd;XEGx25MA^V1zjH50g&MPB8F7zoB?}hmOcnkf{!GA>Z{QohpaCPcS8noIVOc6%C4{jP17DG&jZF;P9r?wFzI z$+W}3E}CsP%oqt4iHE6CqZphun*JIqx=$fA()+qM`zWegl_TrDDP~_>+U*;Kdy{|p zF+rHJiF`=P@7;(aBOBoJ4$n(TRv_<~Iq!{_i3d{9sz)p3*ekFgfmg|H>DQgDJlT;> zk(~RPubNA4yhdyp>CPI{`)a~_Ih3|QNHAZF4sV}kxt(@K!r)5%vjrEa%6P0EWJIg?9Mk-{TIm7bc zfq|VIlBHN`^T~qjbhT*=`-p}9mCdH9HZ#-l4mXWMn@nq4&`6&Ym(5%gXZk$06*ZWH z3G9@a$o1yB(Ocg~&&M11?i@;JKKdy#ibOXcosT@E2p*d$#ls>b%}_ccXkDD`(TgOo zwPITGpvjkRtz~8}A=4+PJ9EwB4L7ByAlC70m(xn!thg_94=GX!@IC;G+oDk=-u8q= zw3!n<<{mQT_U~m&m1Q4rFE2WyZ}kK307T+YV|{iCS`3jEE@Y`lhO*bKD6>;?i9D|p z0xG|$j+W?o_a_Sig=KwRcUALOgHQ6Lzf<1Ws?{=Q!wvCY{tEtG*-PjqF*)z=7%eM+ zOR~H&z^zk*={CA;&o*$=(1>z()w7TRN$ZMYjz}d|S0_%XF04$Mo>{Z;75~l7gZj&f z%n#K+AO?)DLU|)BYOWhIBVqA#ORC-0Gflj9??dnf6@q{Q|0+5tgYH7Y{HqCz z-ZN~O4K`lH{Dc=wcPB4nB`7zL-H2UXYq_p_M2P$ z0P6rJhD{#9DI$M0B?qy-6qjjhD<;<(Z6NNU_=cJ&_Z zXM78#{?_*|Hs1Z$k)eLiY`JdrmWvlu#u)c5cO_$ zISIdf?$bgKo0MLaPvU;j{Rj<3aLQ${nm8n-*>$kEz5mR9EzCrCtvhO*(}3>hRfcVe z_K$q9l_pxo7nDD9hp+8yy}^5-e|v%dSVe^s_x8zw;BG77H$o9=5<$?TO$i)4PBDY^ zOl?Yn0p;2u1_xtr%Uqw=ReF@!aAuSgGN7zJ+dgwFWreu_U4#tFUKBZ~lP6oyH+$BU z_?w|jWdNLd2i8xp7sn-XarhZXEbWXo9PVX&xhXSfuUuo+t}b+rxDCz=0mu^Ur*HVM z0Ojd`DiIHJCO%6G!y?&y@7^QLTph6ao=SCwvCmmb)r!;|Ruh?;82!2TkO$SoHroH%7Z8@1vCb4sGb4}26B{6GIkabPGr5;Jm>J|H& zONQ;lU*5_lx#LhP>G!|mFicJ!&d_^13KGdDsAqX(fx>k@7H_%neK?l27_OGTY3z=( znX6N+`OyA-(EtoIB~nGSgD;)DgK&U4C2Q&`>YBn-txI%eFe3=&&f`Vc@e+0AN!?|N}ul&`b5w9@?~hVsE&qc;Vc z^~{r#As{9|74>iF^K{SdxsvCqoe{vJmrG>e<)uG+17HQrgWT;T!SlBnFPQL(4fWwb zyv70-`Xz=0&SQ{|sXo6dX}&(czSw1PYb!IefB5hrkm;$;-|cpw{_^qY#%W_}!mvf_ zzoJ4*7ZUj24R#mV@s4TFKe!#Pr*Wb6&^^&_xl>n8L$K{cef@TMphMCCAU~^Wv?tgQ 
zLe^5N6RiHPo>dW%+FbN+Lx!tkaZ?&?<)&Foj!P;NjQ#LvB%XNoWCDo0G3z(%sSnDE zHgB(IdNCdJQQ;>DW5eTdTZ$E6zg7lP0V!nr6qolo`}a}(N<$n&6A@aiof(yQ`1!h7 ziRji+ar}T@}lJ_^AaED&~kxgBL`aM`0Kcsz||Oebb|ZV;J#}O;%ShJ0UK~q zdp{sbpL6<4y}Q}F-Y#zt2=Lql2xja->oR1LiU1dghDWE8@W-));ZXKcK64`n;CK}~ zSEFocwKu_uy7NU}&r=$MSmr0LdkX-9&DA-GyL|T_roX_V;4xpYjgW_m-QN#W|!S>0-)EQB4zh+ z)f0Fc2(85MINkdi`H3~9(bc*y{49l^EYDKE>(NHiFsH?-Z+AQXyPFjP9$NY6nMi{T zJTjzZo4m^%I)Bz_mtEJZ)u7+B>a6O6jpYgM1T`F3c|j+J@Ds$Agw4IUK%Tid{EoyM zCBPq*qb?fGA%)cp905oyNAs9ea#R00NC|Y?N!D;5axmz8kX0c|iU8z|A>91l*B;3N zZ__RqOhM*Jx-MKiyKn-43FKLWZg85 zH(Uem=0&gUK2M%K;RDC5FAYy2hH^T!uuEkT8Y!-q#a|3zGH*si(<~$lx(8DT>-I`{ zRoM52b{63ZS3So4liL2f_4&Pq9mKF65>H6;`-@TNO%c91!00f_3Ls+JnXZxNJ_Op) zhI{K(UW56O$H%KFnABAIk02X~uSiFVbCMr!$I=xo6L(w?$?SpzG3NU01QvW||HCjI z*3&Gy_=YUHa7QS0yy?Z=5qr5LL7kent^RXDqf9D_f8#S9`g;MiIdkWO#d zN+?^{25x=?5G^K&?%(@1on__X%n@xIN@IuH`6Bc=QlDJk4I!h+~@X0OILh70krgjErc$(v5@IBxxHk`k_e*$iG@M}(o+U(Z0v=_ z#l^L?oK=OoS3hWn*9v4INN;jlBI|}dvt>b7&SF=cbN~_4xe==Om8Z=0ZccC#`)je@ z{LxWkOq8oHcUg3<0_KoVPDlsp&7>Q}xNeRgHjbF#Z zE%z9cC^>AyWhGfcqa$Y)tr(=GBhfo-Y*l}as^mm|n(-{uQJ9_Ullv3nS8*oJ+IQ4+ zrp(*J_XGFWk*qN6Fwr9!6sdT4G`L~AO&8W6lYPoV@)67~MzWOgF-PG#hLw{eSt_^8 z&gu}z3wxZFUYeTrAM`M9WjqvgI|OAy00|2c#w*tP`Hpw$-L9Xta|9D|DQEvTnN%hR zWmPtHz*NeT;xz7w)yUD$!*I5@4}UDpqE)GYH{-By9|VrJdk>1VP_g|!Ky;=qEp1Fv z=4fY$)`(H3;xIt}0zfu{)@}L%#dFRt-#n&}^iNiYY`W3c^6;pO=ZFY`A6}z?7ABbTLIIcH~FJ^e`s02pUC+(L( zI@0LC$M}j84TUj0$8Qgt4~{ejAKC72ZKO2ToA+<6u8IZ22x*KBW`hjPvFtv8=!zeRE>vW(y5L#UmZ`IQbZvLiBkd%J7B}DhB74;6he1u3%HA& z)<>OwB>q6dqq$6EAB0m1|B;jd2O$Mx^>p9jcZ9l&QBb7F#62_9_A^j}ZpL6y2ze~D zz|2VtxQb?-x0Bh!8udV6)DCQ?6^Ramvc=#k0vPJWrGz|JdE72vQ>b4s^YqozEDb&B} z7VDKE`8A$paaufy&u|Vx(jnT+7tewhC9M5=Uld*}_a!Cds59J?Mm^r0FG>11V~{;d z1GhbFzf6$Th9NE6B4I!ICc$BV1qbX;mG!LF1DC_^ zdRFx=uLP`;%^&@rBZU96hjGB=d&|9Mo1tJH%Zbwcm2L}dKNgYH*R3B}Kuq4BI66i3QNfPVwrV|I!ZF0RZ*q3lbEXpXpwh=@qaP|4Dj6fA{NWkdI@cv3Pe>~c7@#F$R5QCoREUXl$6S3;4 
zTpeiZf?%;-V9EQO{T_;|_1twsd_F!$E_ECVwa2Enw)!9U6irPz2?=MXq(%J4K4iS(5UG};OX4yPQ$JW=r|0Fkn!CdEfBxR#oRlC9Bq!eAFhXmgoN-Na?ckD zyE`Qa8HfMBHkYp5;=U9Mk4@rcfLYFwOR8k~9!>Y;&38Yrer|WBPFdFS^i&Z%jhl^6 zgtv%C2A{`$F|j%aS&e-_mhJVo_v~$Qor!}^n_s+t#dQ>0Bf8CkRh}m8#Z#)-a5lvl z5B2T-ci{nHjwDJ-N{-W0)|zW+=dWN`j^pq|nX&XviTz6++B_DJa5Q2m`9YZ)e{p`S zW4s|M()g&P^LlE4m>qKep&(|FPHDSKKTJ5VM}KHKKNORDnWPeisjDEZEhj8v(~HKh}U2HjbEez<_2 zN`D^~aeGgBcz8w%H5~w7S04u0_touf7NUi8PkoeADp;{jgVdluXhzS?sM(r0-*jrI z!!!9FKqmh;bN|m=Sdn#L-hD58w7V)N>fJ_$c*s~awsv_-rP(krLWj%KX1z6OS>Kov zT_>~Q!SI~k8SGLHSOw8dk#Q(r)Qjvx5*DkEZU?_}hbbNC#&@pVXPaEax2H}o=ybA0 zs(Up5(303IkPEDxlq%P`wPZEm;&^ESETTWokH$DcI;j-ukj%)H66~2hu}eK*V6iK! zMw97$8e&^cJdIsAsp<^P*Oo@9qtoA~O%t_*?l@3NI^>i&m-+)q`k~{>UqP6wK z^m9uzNRE6;lgC+ky#E}Yr7Az15%qx@B1y3O1wL%2^6}9`2IVaGDC52JBMbjCguz0J z789)1m6fBN*#-dX3VwQ=?3fP~|6oor%Tvl$`HM(z>4Ja0K=IKXw|DEjy2ZB9Krb6e zt8=X$RolWqPXq|!(&)y?d`?obqb5^Gc`1`M+H4>=_^4vcVC+e+E#nqR^EfQdbNG7H z9fN4) zs^~kTsMlmt7(q#Im<#uo$^B2W3&VHaKQwEf(>cDblXriZX8;8#>HFuU(9lq@#Kgja zv}2jfYh3^G3A(bzV0(v3D`Bneic$twZ~Vx?-sTYd*26ICDEcGA$I0SJt7|D!CmTz7 zTr9%t{BIV@IUgUc@>ZU-Px?vEI|+t`DIJHA#2>Tu9xc zDy2b-YV zV_3KQ`uWF^T9SaKJ>x5Et)|rSgiE5#niyGdJT1?VuLaz44N6$|n>n`+ffpLiSA7hw z^$Kc7k^xA1M|*Q@7rz0+DNU? 
zJ)F;JGmxa!*Px1K3?GP9dXxB1)>6EM9M?nRPd4?&QG`$p*BtHvmO;6#HwBy21=l3MJ;$cU=uErlz~3FEZVmSOmpK@-dUp zO)=op%cS%N)))69TGm)K!?CaX-*X@m(dT^k$K~het>fX7HbU%VI3-@+M}Iyq?DFcI z8rvO-Tq!&MRq;|XqIXC5c6d(c*1M1ujQk%D5!60|Bx6^!+I zN;8llem~Dq`1Lm=&TA0;yiDiq4_k1AIZQK2|Df~#Ks113+PgA{@xUV|$4wW!{6yhJ z%;Bf42#%4cn=|WW0HM?7Nar{ZK3HweaeJRsjLzsr3r=hYhv9n71Mz@@BrNZ_HuI5N4 zju4iB(5p0TN_{H`m`i;GuSy=?XvBb=xTV>qMi*^~-p}?yI2wrAtys@OUuV3QPV+!6 zAs4imki!7AZRq}^?d>gDoYS$ZUzKJ@Yq27fFFs;2k#UL#lYb(8Yy*WqL&BLdlGeU4I^?Ssrd#=cJGHYNn8aASuJfu= z{kf{>s^yR@^?6hx`|D)DTBWbCT`zMS%1dt6(F zQd3hwf{<@uF;;kXv3r@2B>+b+uv~mV6Ebn~62Z^0rMuf=Yog3N6@=?Z7Hu^vT#fP; z0GI^@1@*qTrX?AIRY>hv591!`xcx2sz>()I{Ji~wod$0*0P9O@th>b;lYQ^rw;sy$ z*QHJvV}0^=e;Lr>>qXBWIt!(WQ^3iAQVm$2XT8nLc}fn89Qoq#f_8E+$yfU0{n3IQ zL;-*n^0$eAW0It98@>4Win<8>DdZ{J6T7iU;2R~KVFF0Ahm(bIH_~};XsNcE9p;nO zlBU<|CBLPCa>+6-;90%V6BBg|*$-!VLGp@|9E1giNzmI~jRnDzyxre|&in7Ds+>l% zJ?i#BF&BP}ZAhSUI%*-&9H7o>HXa2gVSpr!fLTLfTEfx{VqY^6`_y2hFq%c1&Pn#R zY7HVfK02tQe27g1D;ato*`6PoZOb3bdmQwh{#MjeoFaehA7ucl#Kr0*y5PUjVj)nR zpuV_&k?qFv{~n^g=t9Tu3_zyn0e)t>y|3?o82b`%D%bY=>^2z+36%;NlQ|iR*iAB| zGGv~LGGt0cWNgq}A|!;OGGxk-F{Qy6iDXu7A)yS({9i9RU*C7m`Cb2WU8k#a68n9h z=eh57ueI*`dGq@91JDjlss|&M2A-zVc~5cPbm&qT*CtCtKpLV}D6S>bU0I!RvVKEf za=i8L$**>=ly{sKD`akMcJF!X-z4j~-hq$xB~hk~EMlaV*_3-soMJK0 z@XQt2lI4B>+UET@l$#wc0}JvUY`6oDwKcMH6*+)MCB$y!99^D zZ#BeV___V6i%Zki(L~!Vdr%pb5~Y*#GwPIEccSCkYx^ua17r&WhU%aQrR|OK^J9CWNWPTZ%wX|C*>2;kSYs^v=D{S#@(76CNu~)gPSD z5^kBvhMD)bgU@ZWue0smH}zuS5!0BeY7aOUADQInS3vW9z#ap_ZuaD9i-NeR^K_LX zsD|Q89HN=$1{9G2FxuhR;>s1;!cr&!j2;j3YzYBDf5vk}(&X;(x~nDO|N9mx#rMf^ zfm3Nm`A2!Cb)ENv9OIEbNX`lWn*M?M1=FY&Enbo4K44y0Uq6-iv-r+b?fpIX8r}$z zq$Q+mdQN}*tZM(@ZQ{|P1KxBXd9>ISye4fSe8Trq|5H*Yrsl;`Wh1l4ws`dvUM2t5 ziP1ZSC;N5H{o|IAf2o`tmMVCY*y`RokelHfFIU3!-od!t%F?-{5Y@0*WMn+6Klq2s z1&W(D`L-NC>Z%c>aPfezG0@*_a*FAMaHiWc?sixJR`EO+d8B)Fy;J}Gmh!OmM+$@i z&==73vVF*Otk}onEb=~1-Z8hXCfP^^0Z3ez;>6aRZgu{4Gs?T6FI7o$bNTkJzTd_( z4@amDSa%`+#Va?o6i)P9CMW2-brjRcEUA>z7EVKThZ$M;r#C9Y?DG4mypb>23$}31 
zQJ#OkT+8=&bK91^Ua@%?JH=z@3yAd(X_Q;lX&pyPVSl10)b;K;9t2O*HK_?|>mL`18-i^@vX(oMrIH zUUqnn-uUw+E%5&!!h1cHdy7ZQ4_)Em&}iYk+_~BN(k!9MNZoS*n6}iVCKW6t2oda$$vknb6|M(+5H$*kp(#@<&Hp%J743lMo8rFSG zSGElCz>f-^s@@Z`OeA^k{w|tbiK033<}%cNo5WAjgobX$i9d zIBc27V^#M3Fme3|NsC=9{%3|I((DUv{|OeJE78&(V@(_ZnojkJ`=ga-JXD_?;4yYE z;a#^*?T$)@-8-R@LvBv3>G0qG`213)WhWEq%hJHfhpLXZ3r~&iSz?fB`7z{Jn9!b^ zhFPpTRCz1e~b5XV&DHDv=N5Yay7gw>!NBBO|tdO7&rYgh7~G1pEoyU$9sziop>ae4Qq(jz0O zGh-TCw&xA&>~5XyroV zEc^7j=i_1CUn;`^CwUazDqTbzBWCKd`l{VTzAbit`*#;Zwz!nzbaL6T`-NU(I?mxb zZ`vv>^2^|l^+{)snnTb)IS2RJ3fBiWuN@b zDC1FN{tJ;u!S-t}6clHhDdsKd9{DyV)0Yru{Q?*o)qE`=p`{t=YO|b*3$>%`jYoy# zD#kBWJRh|qa%SGAyBn18(N_u5YY7B`Z;c}@-(_PXEbP%`SG1I?mWD=@<3qZ8*Iu*c z3aU(e^}0sd!3WkiKI!Li-oHu!Vm_ClH;?cTsbkIoA^jA0clX?ko66pvx^K4gR4m0K zyn?_~?{$B7crEh8yuFxp4M8BDU*9!rWSE>9ZQpr`U7J5d!p!sS-JM9tT(o$RGT`-K z%Ielq_Q;^F@pWj9-LH5@<$-XC(0?I8?9=S^Y4C@xWIEhllU*|;(s!gCN*_NG} z$sCZM{jXoIesy)#657GeHxonG`-&3GD_oX~)9(l_{v7if=ClI0p}ft$x9jZm-x%GN zImKMRjdedM=<)@O6WRFQf&`!j_}ewkh$cJbZM zaJg8W#!GOk$fm2b5IlE{sS{ z;}rzirFl=dmBg1C9Cf@s^&!u$;w%FjSARV1-@uN;_KvUI z@-=^j2lNFe$1)}sR0C4m$qKA;3>tlsmt}thX&Qe43pTch?icN^2~CM-Go+_}el=AK zW}sKuFDmr>Rv^-T^h)?JF8BV?RCeL1Rmb&7-qL4lLy*t)a|8~Chvxpq$#nm-Is&3 zoRa2BF!swk^!MuapVsAp($A+g z^~DEN2HQ>U*DA!p7Xq(%kh{$;4h8ko{x{ED`|fZY+t7qf!Q*Ws3eMwrC)3IAW zzg>8~xpspQDns7ywCIjCH7T%fyvw2489o|tzT%!}-tiutf|Ex}%rXrxQ{pv3;=KHw zvV*E6j{=E5Iy6{Gfax@IB&Foh$5vmUA*|nrv4efV15knV@{ZftI&7yXSB1;i(9zPK z%Z&tdmP_e+cE;oRi&!EB&Z6(%zlgF0rurKsTNHZrmBW+9wMP-&JNU{6KBQYcKYLSt zBQ;jg!N0h;7*$}hVLD%BYfq#Zbx*SDiKp(ngXJojH}c_h2Y2lbd+OZY08^6imG4Tf zDplwZI@s6lIMxt1TI1uXF;bKuV|VuYFKh!0c|Fcip zLB1`+?c=4OoqR>PeXsf=1|tV+3P`(b)Wy|q@%sOGN7$bo%wgvZ)_FoX|8sT(WB63YI7w=$U2>ZXE^}q z;}YthUd&^O-|uklm5S+G4*hIzaBh5ZtiAvH;a?>zDw&SW?;9I86)>r*Wxmpa$Ykj@%z9&r5-F%F{XXBe|d-QWImG1sklg{5d zeyVL~X85dTD{kZxQ#yKfZ|LrD{$q!)Wv!$UQSRVb!MmS+k~F#f%mUx1*9U3`CtqG> zFAI*I8Hy9R+&4>R9_V^}&i#qk4e1ga`z*$p&>rSg;Xb08+tcwYVcG2KOldDT&|*Dx 
zcRvwA+s+pfe|nWc?a~OSiARl@)6Y9EACSy0luBYj*s8#ZO5=hk48B>~Zx_<6udAlRz2`VS{%0xJ|_AJ0wx5F#hW@ z$!4D;oLlVmJHL9}qHH8Q7Jq^#OLY4m;Znd>T$?^uIUspt!7T=Os3U%?~ufYtvK+_@~`TlwE&#Bss`x+XCafuM4Ij6qF|Vfw*}FiH8|#VabsU_?+h z`}yO<%=8u;8?oQB0H(gv9arn@FweaWTP9rDS8(6o$yVE4|6P+~M-MSgdQdxH`lGsq zqvAa)@A>J)#fXh|wFsmr%Fj2PcOw^3DC<|1o{L9#U9m~d;96w8s6mo`k?Tj&xhu!5 zmXkuoV!R7HFHuF_%#7s-R?8W8eUVL`k?hs)St4Cf-4S{g!PU=dM#6y~gNj|>`1R$tICzgjrz`e zRHkZN)dqZ-l)L*5y{jd2Z}AZuS%l`M$%T{qd4o;mj>U@TIu11U$hSM*AFlgpqWQ9VrixMa2T-Zw2l#=KcfgI_Aw9A3ml6y(jz6gUb9nLjOU5oq!*v$VCOrlR1=fBxomD*l zOkfZv2HFZ0Pkn4@x7l0fo#Gp%;MUvUM_(;uv7jNzU@x~O=YhSOd6A-Awd_!%En;4M z;rm&OLlxS|``$huV=4_1^ssk0JJsVySF{i1qzHa-`Q_ttRahmq;mx%;f-vxk>B=gRan4b1`N2m0JH z$NN6e>Ix}Yf4cy8#>$>>F*_ZRt-R;xrwy(Zm%V(Od-CI5)Ds(A+4BiZWy=9-sjNwB zmF@Y`2+`|kFoTS4*9_KW0ZjxuJtOmG@9AXg<&g@<2D>j>?Y*NcU9=m37L7|4A@Px$ z#-dskLiYC;p6CsYh={oDiMJbn(?2RIYThJp!HFngO>PaLIu0ksO%wrQ$Qt6+`HuGp zXQ^8zk>Go1!4ZcJJUuCM`cte1<)J&$GW3y~e6#gGZ?=yS;eXvQ zHLjSo^2nK?FiE$80g(Y?e+Q!sjr;y_EOx6dTJ4Cbb+1otd-5}P`%K}4%k8?QGm}zB z-kJ_vv=XrC@HHa8k=V0m<_m|y=H_F*wOzIEc(CjF_N8YxXGJs&b(INcF$p~LEAKn3 zZNmxyeYxI2X_gutHBfUp9IP3Y%?2w6tNMw^m=CH7zL zsa_4@oG;ES>4<3JSILUN!t9R_eMC1H_T3`j+`0ta2rGwIX$xpjdPLzarlu@S4HK{a z<3YljciuD#I)2?axQqLMV7sWA=0OiZjnxC6d(p@R6V61-yIOqeKaarznxy4uF91C^ zX!#9vHE!t931U3GecS2nr$L_Z`ciL{44%!cHnZ|4(IR%~XW7?-ko4_OAKnEn4gg6o zEQC5ztuo`)&k7yA!Yb{AK`0C_Wg&tI@!?R=&oV44KNk;7RC=zwdW-PXqiq!KKxexX zx-3&CPQDzk*GRQ+m$ca>oV7AyVZw#{<5iLm-rRZ5_u}fR9HSDUolKl><=A$qQhfC2 znK37rnVFFxG_s`jrN(X7?1Z7irBl{KEb6IG*!607Wv2LnSE=TOK`h3JJadH}1-9j| z>0Gg%gm9U>+Z(!J^7#gBk7t8c&u$(9avY>3kru9gr%=RHm^Q5MqkmIH=Qgi^09Cu@ zGw3n0a)EDk``$ZZV@5+^(f#WUQpE7MpMnavu-u#!Q|j%-5c8!TUy^UTV@M>mP3rgR zaTBZexpEz`dNgH*Yh(^z;YxscY5RF*)GO&bI(~e9je+OEL8C#=E+BdhGwefQp86BK z`1uJk#i`x6i%Ahk?T?VCf$qgT5-Y_+nb37$YD8E|OpQDzemaUZ5DDnN!H?l-3d((r zFbT61(@nVF?SSXnQ&JdBtVL62P)|My?Vck{YI0d`^HST8`lX&uZ4 zecHWxwe?F9T&pWKSYI&q(@XyVyTClhCKZ9&@bJCEyt8~bwkGVssj5L1a+m~=BFAv> zaWU90z)bg#&(C3KP}#8_M3AKsvW91)uaxZL=u5R6kbR4_Yq#!Ha 
za60=qga@bgqAg~5$KfgSr^g)fUj({GC@P%zMQrYZlRx;BE&CQI_D# zJSs(|mN!-}yy)oQFSf|Pn>Dak9SA;Xi9n#cgy_yiuDhd-!NUrjvf)}AID&vBtL#k< zuAc@d!$-TP3ZYYa|Dl_zx1OPJ%|YW$P7)Xfu>JK%GA~svT*Rza9$Y{@usM&?hA%x} zJt(3R7x+|8b@M|rWo2bRK>djIfjlg=_U5jsw+@RwnzbL$e@HBbTF_ZmJ{V-G`}v62 zz8K2bb;+W~a?CHyfoh~_#;z)W%Nf*6H#G{Pq=+?sAQ|o1$ndQ>Csiv#?kg3o7C7!f zlPR(D{>DlX*7nFV(wsOimURBmecXC#;cgSjOQ()J3t>UDQZG|Q{ zxl$8hVn#X{Ow^ryr=`4Y30Eg}`vt_l$_V+rCtTd!!z@hdi1ugHabvVF-uN6--9b1w z10M{O8YmCSUguIz+-dM|-`4d%6A5SP?A!T=U-;NRq#Zb8e(hOKyU!8rO>`M&5uTII zQE`>`OY$pP>YWhSuRT0GaI(}#S{~c%1-zghIX5{rl=NyH!|BtfhuIkPRJXdgq&ekymnv-f5r1)T)}O3wKw)H8GjB&PT%Es;hFX|e$EQzs z|1zr-ANPmw3bQbzn4Jws9zRAZJH{*!7se~ zmZi~SYq|eD*ngdKPO&r`L}l-BTVZ1NfJwG9IjB^G{e!byzY%tbdk0U`)4F!ODRk}B zNU-ZbcmHOKLeKphmc-qE)?nfLh>L$E(JoMFO5%SI5=rJim6dG9gp5r)x2%e$uyR*$`Nowtx2eB{^C7;iRlVJSc3a*5=*f`?V= zDJ2rb_$^;OtRJ8tFE0yC(0Mtu8K_;AJ>I=<>>y>HzWnQn{(Mx{JAN&SAMWnFoa5Y{ z$b62G!x+RJ;_Lp)f@JtA@rcp4%MkIOj(-&e%)aXf6K~rYna*PPD^c-i4=l#q2JQXy zRLSi)P9vMo=JtH~;)cH|<-XER38tzuXU^PU&NCa_f1yq^Ehp!CnlRdG@j;(23-tW~ zJF!P+c?k02`G#H1)5ymFXmMH*`XJWlGrENkSmVLo?9C8jGEn{e7NDguhGOgPG8qX% z0wZO=Rr2kc0imk59wRK{9?f3 zm3QT;WEK%{>7P$%8XB-;3jjEp^2>k#wj3&FXR4HDHB2yzDPL~OG-({>aiv<{o;`B& z2@I6;!rQM4O;ecu!O=D>@PoI&%(Kt4k;K3zP$S5umM;3aCXntR!ChP{!-ti3jWL6iCtUdcCI%C#@_?WzOh&QZfel3P( z^o5n4nOOqviEOr`0HXQs15Ve{S1$aQr=b)#k*WT&aG1V(PriVRtMv3f&gA=S_Ae8@ z@MZ=wvi$t{^S&WFkmBZuTHWSdwQShS_Z%5yFg9cc=nIkN_C~ws-&FaD$cN4gr1fnE zPbh6!>W7-PGA)-Vz!`RAnr$w}KG_jX!B&XB8Mygre*OeX%lVK=sbt(&#d~;aCB%li zW|ShOz+JoR6Lfc7-H0iO2?tHlvp3rg@h>~jZx4lAV4$NTratg;C3S`ERx$BVVI55x zMiDXIfhNjwgxv$kgZC?qTo)8U>6Vh($9(1cBrxCgG?uyJb633s+ceSb*~I9;)$4=} z6{cSy3`svJ|8t&yq)=&ba354jtKHt1FiGP69ZCEjvc=rqs;BOX=G!z|f1jyw^_}9f zJI5~BqSc}2-2;RF>*TESMH#0_ykTlSp#0C87}Karxg0%e*O(EXeM5nc7M>kp_c7moGBvg?DAKZA z`}6DTGZO<(vqZ?yfpOjMIAUVf`HF@msDZU^@SU3Vjcd^Yy6=o;;Xf8!M{2!1 zlfFv(APjCq7}A!&Q8+`lgzMMXSb_|Ktq-b;-+=T_tL7#$hE(a@-NE2kXf@{g5CQ4S ziU{d{O~0(5{ipZ#kK9~OPGt3$cyoh)*LmhOyFZE;q_n_1qXmR;3@J)LJ;>H%_jL^Z 
z8|dsjF*bOpZ!N4RJTixIPx@HqRf5i4K;m3i104@dp*>n+5K%y$onxYIK`iw=HFx3> zw`J%vk2E-ecd?Iw9KeG@YCgLdUO)lFIk;77$!rA#b^#ZhWb!XrvpEw5=lnAL%elUA zr5OD?$Hx{{zXop2k}Uh;>9G4CsG2Tik)6> z2#?_#_!_}x%zy1a?=E_;a0=Si^7HcxeBXZzx@vjQSA7-cb6^6`S2l*@Mr6&P=F>4cDEfG7YD^_fq^%)qR26`3ysB4MR#Zuot??f*OPwgcl7Alj26_P7A zSkI0pnfn(1@C>{p9WRQ(HC_M`RMIjp1&}51p`_ywp?-AXd?@zoBzS>M(X_-obQuQnqD5gMx?hl}?F(?e!Tgyf zTK}IY_28Le6y))@{3ODPrB06aM=@`;X9qBS$sPzc347=D%{#U5Jnk{LLD_Q$rRlQk zm!3JHuDuyyDKN23agloQ3Yo);t7U3$&-M^$i-IokHgkLCzi`oUo9Q2~c{+VdXJ;`erY74pH-l$;0CuPXkd}5I6@&FSsTdX!NNS?(Q4eYB689g$ zqW+%+MFjiF$zSO$?<75F^WsPzVwoZp#5DzYirqaT?pFy?rCA1k6dhk+lq)1h2aW{V zrYtov4i3-Je(!G|q#g58HMz$48JCNFPq!3xa*Jz{!<+3!^Y=6J~n zA|})5@!_V|&%cxf@0z}d%bIdNY?6_lJwUaKe<@=7r0ZFt%?p)H)60tLjD>WIYX~dT zKkO*t|9yxnhDg8$+me;$DMTYYG{5Q1mS*it+qd-jJtq*58}r>$&8==lU<5-i`f)PV zX;A>OJbtiW{%3E4b8FrSPfgG;YM{hNxaOkMFH;)zv+wF96o5K!SQ#W6YR9Q9tK`%aH_fgVB0;5{{-r6&;h3e5+L*KFSS zoTIF2^+cuwIJ@ z?YD_`a&q!GTsJI{KaO_tWZ?cb^ie4TDuM3mzP z<+0+~0VJajPy8M{=z>5VFPCZD1wT&Tw{N%Q9RIn0aV|nS?2Z9~^!4?^r1l*9`f`$i z@I_X@jp-KA1$T$NioEL_!Vz(9Ix%%jD3%IbGa%+&G_&6KmNgxJ({h_EZJ zRT9*ti@A*ncD$WT;Mned$;J2-um)FG$_ODo{(h(n#XxCSsViVD(X{t3K;Qg9s=!|g z8x#92=#oeynagm4-@dtK$GKE2%KEyLf3KKZ?e4|+)PUL^-RJ7s>+a}ktbUT8k3QVl z(%_$9L9&_UiwjG!VD%|?p`^YwsOi8g6w!b=wbh!4b4)t)b>;u(7*NtkrV+9+Nt`9 zdg{*0;ZdGF<(3>Iw9~Fs@hNai)M0l5y7`^+H92{!ii1?vdV^FUy$lG_< zF@IW5H*e8}Uy@3oo-bAJHyj<7LU>n1XGa|xPI2F~he z7-+2PMwh11{oO0ebm)(T9k9aiD#A2c&!D8F^y$^rBOm-ZW#~(V^P$`q`scY1NaVcj zhB=0aP?qcV6?u7xeBtj4l3i|`8)KXWVvEb|-ftgN@=i;Df*{XFQIBqVo1`B+hbDO9 z!@?^Lqri8T3q5NISqcQA86YvBjQ#5fIJK%@)tvs<7^uHK&AN=u+1a^|3UPYF$?ukq z6*e!yH->DW58CtHXxF>}?*3{%1_rEE*d5oLcq&V2MPs88EddJ}Qld6jBf6}Tb9e}X zS%cDl3)%caTer^BgSgUkK7fPc2R;H!kvWu(Sc$6k#<$aq9P4k@MiXZE*{3ff1w1|D zJ5(hP7BQQ;Nk)G5*l&k+uvp6$ZM}x;(WylXH(!~2O^OBMiTw#WJDa!tF3{({%YQ;W z?1iA8hX)61-x&2fufUGT-s$96$H5n9GDgq{eJN6(v4UZu1`@>ndwjSGVOgIj6>xVK zTCdfF_y-cTXdXe2zAEfSXcC5nAWtXZPz}~6dsps&{vih^}qB=1%EUAc4KNq+)*q_d#M2C#x^4 zggtf)eHcncN&4a{Dk^<}I=f65*Pvzl-_P1kO9%)ErVVe1S@^gfu}~s`(stq*IIt7m 
zm?w#PA#=1t&3I85hm;FONLG z{qFV-Fe1RT%081%ep0VpyM|ID#FcaID56&cX@v*FD;n^e$2B!?Z|u3aOc+tg%1ZRm zrupSyy;BT$)alOCXRiKs2Bu2gIJH=!9RT|_jS)+m{K@KI5VpRuKfZZ=ve6KD;-e=ca9Om<4B|Y$$2R8cuKq>d6Dbx?%?C0eNIBL$ zyn&V-e15+yCN)}aJm4TL7E{KQsrtCg#@PA8Xu@#|V@S}Ho|z_IFQ3W+ZtKmI1O|0#SP)abx^{`kCWQm_G6 zuuwe*y7ttzx3|C8j)LPSYLOX1_OpkV3uuN`zOdqN)H=oUfPMc3YNLYEW-{VGT6csB z6&>hUxV?Jcizl35UBmCXhxj;^BxqpchIy7v_0kVCUqoZX z(3kCKYfQ_G!Kz@)Xpr9bFIB5#bMU#_(LM<5cvw?Id;2nCZ)o!oGh)s9NDNn^rM$<7 z2}<4FUB2xq;l7#;E(=LId3|9PK5r%Bz?=nH1#UP5la>%2n;T5q)PwDub6 z&lap=Sr~KNg>}n7d#Ax4wd?w|71G--tymH52*ynyzqqPOtGx6N(I9P!KLAFVw|-L= z#wx*Y<13hJd}KC48s0-${Qg&hv<*=fQ>G`!K$rE_C%J#E6Op}B_tW#&_gKBn#Pq7a z9W~9tp|`6C8mV{x$EEjI9`9=!!Fg+X2UCse9Z7x>^7@2gPo9L6xaT-46V76n zq`ca+N?z1yP$~m=pMm=;X(b$~xqzWTc7)M#ZakCW6AIwS73vB#{SfQrr9?@_S;TrD zqeN$athOL;j<)(We=48--gVU{%%1ubPUa>4jv%T;1o@$B^#&0{$R9QW@R|3(BoPGd z_VVgyKu$VGOj7RsV?*vN4&#+$Qabf>x6@Pur(mG)a>%60+m_|_s61B=SYypwqFN8u zBDkZ&GE3wc3hqvg)a;juR*td@MGnF8q?Ba7$Kh5%R8%YMWt}TkachK5Z;0obAeMLV z=!2(AoNWXy4x)9XmaffpZ1N4T&L*}p;RgbF?obwN<23)q(Up(bJ2E_6Co1p(Ocpxe zca0fF&d!A6RVBLSm%7nmgC;F?Iexq)>xcnN67fM{w==ER0K8dax`PtGM)*I?*253* zt71|D6Yt7!TrxzPH-fDqYRa=BY*E%NmRW0A%8#(c-7aHC8IlV_hchy*grCt8`&puK4&wxXEiqc~#AHngV^JP9a9HbSv zM;WaSoHSWIi(_D2G5Y2ZzG7}o!DsrVrlwkJJ;h{2MZ&=i^<&;S#hPYMtm*h?t3j+O zvp*q-f+*k?+UVYID%%EPC@%5`h!XU1Cj6zi{0&hzFZ+eTDTtE7J!DUJmK7EW!}4ug zbL0s$8TcKdUE^-FSj4fGY93;L$g6dGZO#awa@3in4l3`{R z7cz57wcecrs3wHey**&wzdeTZf%*^AkXDTKCeD47V~m&^Een(Y6M- zY9t`|(ks+`b$>Mf{qp_UuUK@+-lXH#BKotO7?#$3_N@6r9sYIVeiv91{p4f$FL>T) zBa4^^-ql+gC$O%T9CvZ}8i{S|CEJI!r)*<~-d8A%M#*&aLnSa>vq{i}(f{epw27-FTE=6dA5GGiI&OxAre1z_ zB!T*R-WjI06*=n65jT&Gj*Mv4R!A$%@0TY5#ZXIuqz@Wi7T|tj81(dj|2Klo=@+5M zE*$l0HIgOK3+x~Yj={iK5SQJ1L7oIDwON7q0UE8zS@v%nfXF(Rt~YDS1fM`&4%Ln! 
zH5MNI93Us1ppE;PoZN$4M<2H;2Qa&85SOxE{m};}_)GT-ZkjX+SxL@f`x9^agnbYp z4|Y}0w7R!1Ss`VKde{E>;A0V$>x1e$z#z?y+z4bR1%1iHNY*mRK%MpWUX3Bf{q|vh zZQC^VKovExJPzR-rmE!I)zbMZrZZ=Ej6yzWv&bn9>uGx4z!nlX-!S>{PMJ+p=N2wp zDEQ8grDRxWuJoK=t=Vih`%9n$X;A8Bh^T@6O4`BwZN&xd19w`9fUnM-;FJ?_e4Tm0 zlUQXp>~(=KiX4Du1uq7w9S(@!`|pGK(n7+^4#beFEZw!omF%`OK zZX|l~Ap|y;@UWnBP(cP6)EaA0pgT8bSGknI+gLR8MlnFu?O$N8KbfhtYU;34V_1$T zoTTG-4Lndg5IX9YWVHCi?Tt2%)(gZ<9x}r4zL6c6N~1Oh^$k(WmU-u3um!4q3^LPh zpH=t-c3Z+;V%&EcAFrAFBXEMW#g)>Z`&JW^VF-A(eAxwPycc; zE0o6vpoM=#J#3nR=|!eh&sY3pXNUo?qseyueC&Zr63);mc%7nGT4KTx2&`_R zu@t6b^ft%TOJK(K>AuI*=-Zy4fiOm3x--2s_}=gH96);e_N}YvO#un6_sWGNu_6f07RWx2y(pnas!k>v5V-)csXI77lnCR%}I=&h?jxO9#h(hH@)K2)01R^NAcVDsQ zg0Q;N_a)j*F;Wuzz{p%dW#*3}P2S#mP+DKoBhwh z`OohaT>Fv94HQo3(IGbiy3W^0S^f?ik%hvPKRzaUsc5YKD*^eI}&cYK^{c~~O zY&`t?7V{foRPq?y!OfB^sZoP+1^K@5t6s}bLFB4W?9&?`7LZNw|!;|)2C%G*5O z&2b~Ei=e;r(vB_L#Ar6jJ-qGdPw3IIOE=?X7KXW=H=OXQ6&~ z>1zn9U+YK7U|bF77CKj23OpWvOoSDv2e1Nq(7&(Kb_T@R13;TWdtBi28d}!d__zy# zpL33{gF|x5*9RvwgN3f*08!a&Bi8=>{yk0cwxatA@PyXZacnCvX$!)2y#x(Og4YGA zBS7%ZojZFP_-6Kb{cN-ea*mlj4LNKdPu8R34D-Ec#EYYw+b7w9A$Ieen(0Dg_HdD^Puy}^G^q;Cgo!$Ot3q>U{-NMLxcSy zu0F=yxWovyu5l+~z9k$B!ysCp1o$O3#adh|8x$GW1u0V8NSlqiN7^ahW9WVvRn^8~ z!_a&`hQBo@4Dq)FR;by@mYg8&zIsMflfNIOWd|Nb-Xqos5KJJc7^k8Id@>v*D^qR* z>5T<$P*Q67*U&9sNZgfVjLY3awvmmKPh%^WmAYDas;OOQ)hvBn((W_inBO|QhI+sU zk{ve0lcF067s+3$qn%5;ZQtEr@=HCgDh+8B=T9X4;4O7j=2RK_*=W?^Ndk<#K_!V~ z#rfV%V_)~{<9Qxa=k#Ry`Lm&;L21*wvEwwx12(ymu(P;Se7sQBiL=bL^>zq>YJXD3 zRJ8#WJ1zN>=#hP3^I4RC#h`=}krI~pUw`dvEcFX!B220Fq;45TFd|OwTIf6+Yj`?+ zD5n`L1c?4ajpE2>K0ZD==P1p?M^T81O}qVnEN?*edM4oEdN35IITcZtFw`igbf{uAL?yWVh&_rKq4CZ4O zFxZL^BV+!y3m1^&$ea?U7$hmRYE3dXGQlxq{zu%k@)|u0(L_#V>t4nnfV_}ZwCd;} zlS6A>7Brag-lFlZNvP@G=itL9k_L^=SaZRwqVbgCad*rn+WXam#h%L!k|T{Nn&N?} z@DrsM1_Fd2@HF+8X#Wmp#o5F-8A7r^bzA6hN)dxig#rsjU<*UO;fJ8jAxBs>se7oi+loo3ooC?*eu#AL1b`S@Voh%+`-4vdb1 z*D-f;gOQ~d2@cz@7+_Cr?pCx0TC(TDU1+mdqxd43+Wr&!+LnP11>EB;Ly-R7Uu}l} z%rReh*I$@z6&l-Dhl45uxhghsQ_^Dk-Ch;k%of|Z7V|5JwbaF0{)X^5#R!us@cR-+ 
zaq#(I**;_vK@J7YP_fWmvvXGd3Q~W)(;hN6wmz6)b%7e2r4JQ~I@P%R1Qxz<*UN%0 zpzh}~FK;g|J#i8`LneSb43=!`*g=AK(-w>z&~aTP6Q`I0m~5oUhn(*8?=iAq?`^TM zY9V>webyyw%-u`4u%XthTxn-}Q%8Jo>6uyB>%vGgHtd2xn*G4I(DQ_NJ|$7SvA^uk zr3RK0d`9fzoNu9!)R*$B-bz?#_Zu8bh+(Cfo<0#M(eKNA zbX6H$sKP(SQiZT$Ak?v!%Pk7JBemK@PGMPbG|Qsg)}Je8MQ3JFcYM!48VT<%crzWq zo_zoy3+_Syr(huN#8#^70O%XbdFp$|)?B6?eQZyz7t6PM$)MM`#M30p{!pWBcM0&S zLX2OX^vQw3(d0d17&2TTQ5r7u0JYs;jfG!maZqHj+kXwN!qk9Io2wVbe{_1LI8dM)L-B{}T=Pwc@dO+hAUPD_BRo3=J19J! z$QzjT3Sl1Gaj#hps5oP{t>|gco4wa%_PhmJ`oof=nUXPCW_jW)95sMDdWphDTQp`p zT?s+0Tf>)P%2!otGp1Zy;wmFk`%a7cG?iXv0NM1MXK z$6F?Rc0n}ewSQ7hpY&5}x}M!uQ&WS8^rX7UC?Vd@>q$+jg-XfL6`a?pMw{sUDW;7X zM8_qn{UF&)v@2jIYPfBMjwBFOd*DI&i}{0BoJ&TvC&@sEQ+lu7u7Zbo@RS;8lVtAr z`JQMkoadrt_;UZ7?1Ao+hw^cwM!kAns>M^d$On6F)ZhEh2KqBT2U^mId28}}2}?j{{?Oh_{-@2zIy)$g zx17N)72zV9@=7w*;@6dxGH7eQHP;P8tRAKKN-<1C=m^h*-7zUj^X9s>z8YuGpEvny zp*?9>rkLmMf!V-R`Ys^u>nNB}m6^$7BLeGX&qsgCMg+!1k8yg#B@Ur( z3w#DmA(S?a6U|`grU6MtxQLsC2H5f+5Y#GFT6J*Xk=|Gh6G|&8T0&5l%_3fk4pR%* z_Y5XEhY!3KkR&iKiaFSh4lX*aues97^V##sML#9AZQSdBK)tE3AnS*T`D*$Mzug67 zmf?95(<6u|E=G8_?Y0paH;_bv5qR>`Wy$+{qC7zdW797RM@8MVBKHI;dcxt z(!f@VvP`HcsgZikcRO!&6$j0jLN4e>Hxcki%}EB51d_rnKi`jalM<$XHaip6qI|NGL>e5G9-Ij+?~~mD5s2=F z&UQy{LIDuSBeITQDy2E@)dinPDd{puRUaGLm%;Q2*2u7v zQzH$6&tkAsBz-rj&K7Jggizb^eWy(sf(V(U^U?v@yUNsX@YUS+iTgFVcL}Vz&Yuzc zV(mY?0S{Us^itFtSQK_o64@U!&l%fgRXq0=RigXO z>>1?UxpH}kl%AeWaBSo=7^(h2SrpY;&JhSPe_}hLvMI^sjWI~Eq8kfKlT0r89O>1lj5Ie;|KlL@8XTh zdIHaMcJ(K$0ssvWww6giN%aX^K~qT3y(koIPx|HEe5wF|fM$c?EugrH3ySf12&(j+ zFNTz4)seDHQ!Q@5Hq8$-^X{ScTe`5hRbh@S^7EicTzMmk2E^&(6|AoO%YU5m*>s_;hQ!qrU<>McJ} zlI1@ZK{E^(M3f03V6}dX`I|{>{#$PHYi2}^voo;+{=9+t22bh-jU1knaS>oE0X#nM0TIg| zvLx=B>+@GsF>;F$#mDV%^_@oyrKuN!3}OUVsI6J(e)vG&JVFS(Kni-df`cQ=cV-&i z?GcD_Zo3GVamABIP)^+xiE~3>jnk@uE|sDL&zlyO&@BTaMvXFt8Ii zm_}QyZExDNByJDwbQ5%Hh((e=4$o(&rl^T z?gQ@yC&IbeLLyC^H^f?CmmX0!!nT+_ag6+l!#mH}q^(6Qv1BzfVZnW~mS|Mj^z8akqGGSyn+NW8@IbE8v_=u?UPCt& zLzU5c2y4QBQqWu;5+U>P@e$=ubgPD&Kl>u7O%6ZtXnH_#RyYCDcO>3q^20x?@DEN95IJyT;!1 
z)k4wdD!STrY_y7W#*0(tTYvt11H6r7+z`%dw-BRWHy(S5hFA5M9x_`-yz3s9j9d*( zue&li{BAGCKWD~~Bqb#)lu?-Z?e_=gT(*S`S5syJ32Ajem-M`ZmYE)ROxP6tjwvb? z%_uTPeE~kdP9kxYbuEO=1tPEBRWkjPGk!`bniUeiI>Rh7M=-=<>zzdohx@22ZfW0T znM*wVV!Y}Xi7wZ>S3DtzPl*Q{V@4&dt@I;$AkChjB}7#jI7#;T|GkeAFEn;;D++$R`xPna%2N>jh>$0 z#>Q}{+TZVzupfdJDU?rKg0OVs^__z5p8j_;hAsEsH$xdgEtiPAwwtkZ)vutV?zr92 z+t=fJjc95CpYvodQa5<}+@!ru1HoGTV=`h=DA5Of`TpSQ{o?%!as@87@11F+7K(TT zYn)2@>eh1jmv^%(Sw`Gdbk9{TS#$4^{bR<^Vr3rGKm;NpMBFm8X!JO&T#5g5VW;sJ z(&H}YYi@hehva%~gUlEI**y6F@rz$RBfKJUt}Q~1*99nfz3w!LgZ{J4LY`|JvT z>e4_S)r3uc_;*_g}qoC0YAk0HL4?v)z(9unp$E#mho1_#<99 zu3ft}VEf;dAipKR01h24zsci2BOej_UAEFB?1ghDcj#y{co@&0-|PUQuydWAe(3k>|05@{t2#{m?ptAR~@r1Tv>cWvyxn~EGFmX)@B^i^oF^JUEC<+?COOLC%ui%4KbcPdD07xJ=eWI3D}v0 zo2{*mAIGk~M@W)!$$r*l_cg=b^sg2;ucgBT?O)OY*uXJIWv(Ue&zD;q0Qvs^dhtU3 zz%Y36x|+g-+_LF}NBK!~_kME=)uGd=p_~X-EX>?NQddb^jI@8x69WMK6q(T5_5=k0 z2&)=?z@+}^x-+c&ff=M5H*V;>FP1uwzT#awc7O-uYHHlcyNMPaolHKbBS?+dDnUcF zro_1A|4#pE^JLY3|KGVUaxwLX@mKQ)2CS&a=sLVlZp3aTwuQ9E(*d~EM7CgpB+d!LmIGxT5ib^mJ2wJ3&g@*A`1Kt3 z<5DuxWi(Rns%pAm*a_fF%HR1}ZeD0eW<1_q^DNLx3f+>fi<6CU2Ua;FCUJWMZ5=pp zfGA#XmqB1zKkBCW=uA$VrzS3KK|}}NVBAL%IcWLhl9w+aeFo^f-;&aw?-kX#nvB|J z6V+L}>p#5ODvNX~X1_C;?V^hY@Ju38y=(~P3IoIP^{qGqNsdE89h>cW6n z6Q~pT9=rgl(75m}S{a-rV({$iEj~&n+8u$okl+cgdewR%AyxYz+*`NR=PuzxxhTrZ zAa3mT44wqXD`MWilruk;to`4ez-ET7pdG@8DF6Eh^H0WW?hgtEI7_^(rdyH|MIuyYie$*F+hC?hGOG}oN@U3J zuTPVP|9SrBKJNYR)%U&M`@Vbcwbxqv<3d_01Ijw!%1K7|qjxOTrbhBag5?o*&xUyC z`DU%ao6rZlwGFx;t>TsAL!nUC6-AlByJ3Bj(hY(T!q#6}7cWvqz z0pzAB=w24uhOi(ofByb#!Jl1)@6Sd)%BTas5Ba0pExuE%e_!T;vlxTqM&M9xa-{QU zcRV;$w^lmg2palOO_ve38}NG|G(>9=$IFYoNI4Eg(kR;fMe^BB-LeOsaxjOlu2!B! 
zCqm9YiH2wsz=kwcZL}kcid1nZ2o#BRE=`_L7}`vEB_BP7BP^UvNk_6@-}&^9C7!D) zgep{cp7^65NcxN&j%QfF-U!?|Ts5Md`=A8G(5YFgT_SygFVN}iD|p{E_7pG!s*XL{ z@Un4ZCUg-69~t3ogBn()Sf(gk5!08X#!eMaIYSgKz{NFqYem%(>ehKD!H;q7t><36 zSh6z2fF2p=Qn=Fy3B5sPs)vMBm_l(soPPNLxjl1>va(2a1gb(-zR=w>_w;nBK)#0= zDKta>i+>IZoPH5WA#%nu2wA>t<30Ad$0+ma%RVDq!;?T?3I7*WVVX(Jo31V!=Y`Yt zrsh)e6Vs9P5_Cx~V#3{FNPKebB8+J9V3tIS+whHjp#%xdzNPtD zktC^uQTS$UX{owD=U~hgSh3tau2hZf*VQo1hWWlrzCLpl@yt2{#(x>qN9-AYWnxwi zx|-=g7jnSWwvt5K%1XM4!>w(;RTW3G%ex!6$<$S-ua8eR!h^V4J4XJ=O++nETYyf} z;HOGObt8Jd1y75Nq*l^o57(oX06_x4^bSQ`q{7g$u0e~1*exB5$moRm86ou#+8sIQ z8)%|^Z~wMW%bxw|@9(dK^JqconE%TbnGS!%Uugf324YPzpG+ag_fD$stB%o1vqP!b zWN}>ydWABr+QPyWcV{b^W^z}S8KLx*uJ-*JPW!~?a&t!oN)2&fgc@alU3DWNdA;dM z6<)j*202@|s+Rk4*NSX0q!ug;X7u#2G z{|({=Rm_qRjoOogox9Y8X+(R&${XE*F;qwxE_N{EzGWN{V~Q1?Rk3m?{kLPus~hxR zV``&5rGz(B-bF_uT!wy|5>1DMw8GzJE)P%iR0ZsWZ(0a!RZ#)3DHOH*snepZ1g5g; zjqIE`e$efemL&=l9=19~u)u!X$~uH6#}$9A`0mZ9(GVa26%L&5!d?Y})79I1Lznk} zF@wJ;Syonm^OvCYBGMK`RVh%HZtBwk#xpwX7iI=c72Ip<+*8M9N zn4u3(JFO4CF8U;5YK&sH3oN2Y*~`8`7IfJpY@uhvOUo%qN})Q04G}IxjQa))K0&TX z>HtAW(V)*fV7!iK)FHSBNtsEGGU6jcGK$0z9%Mw-J>xS>=TRGyW;u6n?6(`vmk#pV z)}Gq|;sbc38ezNxs1IP?^E@-?anHQu6*M$bknv-y5{3owagb--)4sXov*w%c{L7zt z*#?>?ri>4#y?H-(yq}j_orY=wZLpBwgtl42YLUIaVSI3v(t^^^L1|rQsKib>m z80Gk{S)_y!BtSD)8kZhFQ3T}~^dcP@;AxKD8FHepF)m5tz3ZxdPg#UjTX!@@=lfFn z6oK}tK%}R(17wFT!ze+k!+51GCwE4kZ z**dd#nN8#F1%-v%otAwKF{r`NLVxiXJnkNNt(;(;t9q0Q=$?^@iD-|EVdkIw+?}%l zLYrt%E}U0mqaI54I1n@&0D#lIwEGKx-;0gA$u*!KHXUo%%&^V8e+s|{@Uw;#OP}hU zVXVtQ=hR7EFC&u*GF_XXaEKi5?~YXD&2*X41MuegxX3pNtf+3BcAx%O+i<8oQQO{$ zUO=?)zdgR{Mr!5eFNuJs;25ZyrT*Yuk>i|UV3?_C@Q!o^;Mmty5WmRC9CRm~HJuWQ zVljRtaVpg?J z8)Ni}SUXC%<~i_DVZWAY-(uQacKjOunt8vkNT?2FvHQ}>ReEPxgh4AV1%0hdT%}~r z4kb+jnjxs?(Al|tRS9|0v*D(-g-tO=?Et=2L$7&}gSX~J5w(oc%gfGPA4dXD%%aDK za#%>HDd2UOHansn{C8hmC`I{T-l)Wzk+FP?GBW3KB$LElm zH8xud<}vK=ZwavPz@&%5r%#(R7ae;$R#O@z88-E$tSi9d#%BJ*&fcg>-_Oga>s4d^ zeQg@`2|=BhPXTQgIUD^pnIbMx?N$JE1dR&Rz4Rt)Yt^bV5c^7?+^nw!>!-aexca^4 
z*XHYne~F=jDD>iXE{9%b;K2lX+;T;Yp>6^Y@_~W4D`5m`a>zj=i1iXB+UjS|o;8}x zTM^7u)u+ANaxLounsLg=1wPd?!B`@A^ZmnFVf1rtzWvnNN?JU7vtq`LW`bdWbQP1U z3Rkptbs4Q28DhJd5X2s@o1&4a5uE#`TWu5Rt{8Cpp64xd7GJ0z?rv_%?`b~K-)B;K zbusvT;GznG>H(?*VbD@X<@sY#j3eNO1Q^<5DMPW^>N>Nv%&B20{Eo&kOqGjhXQQrP zzWuf!aaDSHv7q%rY3D1tLI8gmT$s3Pm{b8B{h%@&?D(`}X#Wd^hcdPpHtT36H?KU`RShhr5bTKtSy3s=?kr*$ZHD=}J31s6CFujCUs< zz9!hkdMK_oS*@3*vd*?f9b}`L%`!r*VWjK~`{tB=z1~XdJP$vKLuML~L%m{Cd_06rqAy{ujouyY6nyP7KPw3bL6W%2RD-wCIh zE0R$q!1?Wcj0wcPkT|^G`>&Zw1p4zJ>%UYHg^(Y5S5qsQifv>U24p?8H}o^QSQdz>E9M*KyF*SsWclAZ+ZWGldATHwU!Cu$)cqs!w-Oxhll-|`H-oy6jsHSg ze(n~Nw>7ohvWM&L?t9xYt%y!!J2Yl9G)JAc$RJ@mK#29wRSd1(mhjy)?C0HIaG4Pd zf8mV&LdylA>lp?n{bi4-rLUJ~m&@Zm`_DgEucdjny-yVL(8L%9tcOs5xVXmAog=8U z1&bKmN85ST#2$bPzvmF3Eo5m|020@1`QT4Nn4Ul_ma0T3?0})=w|RzMq71O~G-gVR zijl%)NBna`Vn)O}blJoEM@^%OON}6Q8E%k55VfDy{jmxwjNT*&CM0LOsQf`y+|6;Bk}9E z1bmeW-(i>>|D;vChBhS#_Vj zAoBnxQy1Q|@kzgN%2jtma_ltw3)G4ypWbzpjqF$Y=c)yLrq6y@5F+wOOV9q+*!Wq@ zsUD}IuXgIxsq6f*26v5@A7dK4{p9J>9UtikQVjHdNN%d$al@{|s8?;8fdilfY8*XB z4)>`E=d)yP4%s7(=s0e}w;mvE*@FYKt^!}WO!27kjcsU+u>s+ElyWj#eo}4mgTRyp z$0`#P(J2jYJHrj!d>4!V-2JMu`se-}dOM2_lov0qNli2=FE)1PSeEiUk{2^b_~;(K zlv_;iD`lQ_?gsb4;kX5gO*!tquNwiNp>8+0a7qvm&5TteG8|M_;9j~5X#t9Z59=;d4=A`B-d!ZA>g zt3jHK+aLM#vh|}c@cRMU9=eUNmd1_0rYs!^BoVb_QbM+p2Kv)qta>(|11Hqlg*(n1 z)@c%@C%!WDe$APQkatU!YL280`yVp)Gn4gQRW6?wZOl4+(qNiUk&{zUV#!|UC*U|y z&pn|%@Tv%!O?XM_d=*>#%qGd61?;RvJLQr8CSO-4oL@)0#dh6#jAroLP4V^~-FGq~ zJ3RBYJ%Ja<3tpij!3-M-ME6}y-#)@ge(JMl+L;D=ivOWu?y^^Jj=$kikQmVDiuKix zpwPJ(>h1R}+rT1xzhI8yIJvUa@BI1aW*KPSfufJ*){FJ?SKKN<0*gj6&cqw5^zG;v zI;Yvnf2x4b?(cV*gB@^Gou2k7jF+xYKWz74JRzxe(O~g|Gm~`F01$Z>!;$6(lU>Pq zb{>Yde1}2j?$4jf`zngRYUX%PJxx?W$TB840$a}>Nwq11F{4H z{Ca89x1*q%Oo3ZLlvv{AU{c;#yq&?~z{nM9sUz?;b9|c=DOIxEh6y1?J@qC(3)O;m z11e2J!)vb6-AN1i`1ttwi<^2+1qAsKd{0Z>qFNNWuNG_jD_nni7=r1PpYo8O5NvA) zD8!Lv>kE)o00*&paAJ7Qm*F0TS48Daux7^VstaA$7Od_;g3UCigZWv+f4{0+T1`nf zI&vP9Ir>*9U*2O@T!y`wG#l>S9R;7fp`L386Wq#m?CXvI-2fPO*oLFGCaVrSz1Rib 
zY2H~7rU7RKB83$e2;DbNbunG$gb6GSC(plj^tbtt5hbp69pEO=clu7yR_3j0s-)({z#Z43ltAzPw+A zdGP(Ip#c%6^JuXVW`IVvXCrrOAJ9lf4*FPf@Cok51+OmGeDRDDMZx(aM77>H|GO@XfDm5rX4cG)3pbwQ>OLf_1?N z(yDi?5JvFQ*LbgOaz29n%Q-<*=9Zj5vkx4!9^S?X>XTuftILEDvXGayykK+gY|;En zf9x7Ytrg_x2@M+K-@b1(m1LoGZ+_H@@^M)YrKzoaH2c-|H)hVOd40EhU|qemb|+zB_lDyD~SIzaAd zdtpsa3eFhHmBE*L?bTkB)K<49`7`i2vi(uB^C4AQ?J4{k_Nk}sUc^G^y3ENJIl|8H z?8obQEAQBCCh8_Wk zr8eOocW?Tp2Sv&aFeR{F32oNCOjmtaL7tZ+uZtr5K^`Tei80pdHHX9Z$OBNUIg2G} zhmmNc5b~jJE8<9&uezy`%+P(-8te!pMN^;m^TYP18Hk;2VNM_RdZ z?y+vGb1!`_E!mW~eBQ!^Img~COu8Li4nn#eNB~_-$Hgw?tgJop) zG~EEB2D>&@oYZTFzS^D7k-rQi^4By+Ll`jnx{IKy^kW~_J?mVa4dXyU!)w8dE&UX> zFGFuCdAwM^-Agn6becRae&KVUV0{3%E;hd^^lU!zy_5VK{=mO+U6x=32bLukiydnw2jx)B z@iX8YN#xQ z;!Kj=Cr&k#Ue<~xdISrUW#6aQr4uB+85@YW@`Z2UAgEb?$zSHjFRfx$60x<*QN6ZL zwO)PgMiNU-XhHUCi4%uL`x~;CxR1Q%HFN#wsO%{F%dcs5Ao(p(3}mCi;2!@?}{}oZp(-AT3B^t2wY@u@8 z6Z$tw#=?zP*6)QsD08UT?qAWkLxep|EloY*gp^5^#RkGnh3JCg=w7vD@((s2qUAlW ztdz2EjOG^VY{-!Ed;Q|eFCI;v^RIzr1>c#PyF98wsaMv~XP; z15LUo zd;X>FptuL3+ex68KWi-_M)GcO8L%K2+qVe;Z%2&S#Lg3esdN(M1&8Etj{ z_fMLTP6D5->2&$TYn(BW@j^F4SBQaSLpmk27D8Kj>$i{m*Fi@rP$5Q}?fuLfV zV!@h`9@DdGJ`d0Gl}qXuBD#LY4kERTla23 zw^TpLwD(2nz5$k;ND2sQRu!Q(d=}r!oi$IKqpmb!IojSi<4>?!{t2eD;GTzcseN-h zow?h<3qYIPKd2R4cTnkcm4gzrZ603OIsHzdTUsC8?UlW|H!3;{clG+_lgh*re81_5 zpQM%%=0BO7+rl*+K!nYvFOil)RG;=Z(1}z%emwQ2HM;bGUsepyvkl>~GZT?`_$>IV zwi@4b9~Iufai|H$7)uWPN1O)xrJAx-V2r0+T;fV$9eW%)XyR#xx!?i-YUl)q-4qf7 z;BpoRH7=X1WLV*GUG2<6c#)^Ff_@weQqsz`{F2}G=`$`D;me-e=lgbdJlb({UoPd` z-6fp>8Yu-U88#mOLJ7Abq@YzCp>NI}m`R4&aJB%sO&e1R?~V9Kk!$iI;^IBXHOG#V z&Q~Lk9ytCTbn;|p?`g`;AY?F%aH?v#Q6g zL?~|R2v{w_2bfqE4DyH%<;+8?w1m2mkeMlLK{p<`IHWMQ>@}(|vf=H~V#ZbOa4#E? z6lUFm4xydvcAGyZxG{w80*mteRD7}E-Z7=UQwaeCH=Q9=HDYMfjX@hGsL?&_ayN?rM+*j}Jf^Qi@K&p`!rWh)pTNlA0uj z{@)Ll2(+(AF?9+l;RgsH5dWhc_^{>T`Q}la5L*qjNF6z@Fwv3lotx6^f-D5YVbyF? zv~vor!vTD{6TP|zbQlP4!xZ*NmW#a~L1&e)nWJ-ZDP3_}`u2`LgEg6U1}L?vtE-FQ z*?9Zz0EsUZEt(>e>-dmX`m2*=BJ0CF7Cyhbe8~p_5X`(==Jd+}K08h9S)Y{CDJ&p! 
zkSHmx%c=KW%Y+`gZB8r~iCfiH37Jkg$Ce+*zaY}{%7OUZF%_s%h9-#26o8x?J0!R0 zFn7R7qTZ?HHE_X29-0F@ zO5Yxk-wHrON95@VtxGe%$+zpGLKNF}t9wnPKy=9(+D*arr#D@eD!@PuS`Nv6oxQaC z;N0HpP+>#SITT`_7o4-Z)%NGk1*J8GG*FtOyHxuui>b$ilXOOPvBWag2g$}y+Kd3G zXdquNuJ{%a5f)8#B2dtKFZ9A+gGZw}MUqOp(i!_&@{=4nBCX-a+&j5&TY&;23eg38 ze6OO^c_#+WaSHM*S|nk~Ie&Q0Dlm=JY-kqr6KR9@BF~=l_TJ&H1ki2hFKq@-PalH$ zUL@6tggj!&N8gM+U#}J@gklV%_>0=wyVgbWZ)ox<`sXLC)u_~39Iygiq_u>b?_#Bm zA(^)|`xZS9w_FAXd;PO4ZOy!b3Pb1S59{+y6#YolJ(UP80LTlq>npCUxs8q_t{JX* z8ys3aC$n_c?9Eq41-~N#-)5jzA9g7)B)lU44w+Oi6E2#Lskyt~)q|EvQIS0^svY5} zpG~-*{f)RdizEq0RVAVPL@@01mN9XyG>vxMHeD`Nh|*R$hs7u5O=8${ecMWd-iuN~ z$9|k4;#vo<0MN6^0Gp&~&O!I!DBD!dqd&=OH?C3p*!ncetS z0;Aqt%RboKKu{LPeBc`&13S9wWZjwWM04bM6yE2J7C~InD1a7hgwA zLlMp|=yno}m+^wZ#{`;8;K8a*+Bj^RktH;4B`yhnCK^P0nNM2!{6!X^joEWT`Zz7W zg;U~VFYx8{Q19>As053jH{B{fj4TL==dRV5hB+}P+-;7iOME;cM(7v><>+VLG@8ya zqCDH&QJN5bvY;*KY)1`Kk9->TC4)V6yVvDuz}*(@$#}E;JO*@tP6prH@Z-AvnMyhk z<8|kMN^8h+N^pPBT5H=q9q-f_P2KKXa_<-j(7a9sdG}{y9|9iL^RtRYXgm8x*1P7y z_Je~MX$Y1`-6Es;vwA+SIbMXi6aj#+WAgszpU1Fc`o1inLFK_t98|*YcXe1-bhtHI zwuGjT#%!6m#dKBciXEH?2pqRD;vcq0t=0&TJwM=Vs9^_v_lQ@X-#ahE4WR+$Sz3?z zLA4tjhd`I(^qpDZea}7!vA)v&yfOP9v8FY7t(++E~IY8MB3lt*`=8ZE}PPnP; z^%{>9Y|cFKCT)$=_Xox0;BVSWl#ziFyE7T~d;ntBU=VmDebd41?bIXy7DP+;INU=o zpEKSu|I8t8GFlPd6+fQ>z$3CPI1_dTy@VU$#HakBbN}w%iad)Moe-U%<>i!JbIIlg zlz}RG5(%NOzbQLrqZUj$p-lz&4QwLd=?|h^%~;XhU>W*#U;W)@CRd>NRuJV-M?yaI zuj`D@8pKC~Dm}eLcC$0+#Yb0h3@UbM=87)&`}yCYYgV*#F`}T$bGu*-Y1%$`o%X+e zy{|~LC{*r~kLl&f?*{_c7CMK9DGGa24}L^qV6+Eb7C4P>_S+r*+}^3qU$h4#kI%eU zMpxYTjMP+vEwIc6jC#u&#DQzy-UXM#Dd;DN5n(lV8U3w5EYzK$5%~E?vq<_#Gm}gF zX;0B0-o9;HoOJ1xq}csLDSd*P(!T2Mo^#dRvS)L1}@3i!lWgmuz*Iy)*|KFaTLSksU+>CEmiJ**|Zob3D z7~we2&E@cCF<{dc?rP^-2zMfy4ikg|82<(4NNR2YEQZil)ny~U>U;Xb4`F8WgJ_p+ z{h~Pn!ragSMy{22TSRbg$MCR-@7bd0@u)I;a_+R8`bX4G24aakA-}UsWa-1Qm0)*# z{QMad*E>ruQ=)i^ddXH#Z~i2sQT}7tab?8A-hTO^xfMIus3@W0_N}yx5M_a)ZZ+h& zp!AO66r_J4LVXImMS|hwJ%j0S-z0)Fw+zd;BJQurC55Fu1pCb6l+x6!2}wNMhBOLK 
ze`4LSwjB4~3+Zfl=G{`h)5d*Y*Y3eJcv_XA?WMQv45(l*8LsFPfaL8!hMy&KCe8A8 zO(3rJD6R13_e*E$p&j;8iL4HSJ6nMws|=QF4Py5k&>VgM5q;A_dN~7~7JLpV?m2G+ zF+X@_683F@e{6}#rI+L!I|c^ZCeQz~+Hp2%_A1&6d-tnW5mPSCuUNmC=1>0yULJxv z-}pu1-vHr@m$#+fb^b5s)zVJdba{?aRe(8k{J8nS>$N#EYG_)QC z>~1&ms}c$c)QoVeIRW9QjTPlvd+GauM79@6@|V|s;IQw`EGL{oi$z8H#Q5EV*baFp z2ArIVYwRHr$MxKDx?ka|P3e#G{OxR*?}f^zVY8IOAgX})3_mT|@6T&ddY4DY>7<^< z&p*(Ua)z;u)yJQSyuR=B>C+91wr<2+_($+QVh*_apelOj7q|!FyYDO&T`j%j(b;K} zo1gCrcx4wu+deIK_bIo}0lsSp@o|5%Lb0T%NmgoJ1JGO&i9{f^_E#!kY;p}9v~T`; zGfy2B8w{~3jyC}=uRkMX#U5L0S#fLn{^O*UnNCe)zMRU*e)`^!@#p!kVpY}2YV4Nk zVo2d9Nq9J+<^m#-zKL$-a`Ozoo64xE%4!u(qR!-u`NJ3v%9TS9zi(?qa+ii8=2|A< z;Pj?j7KuE#-yc3WsMQ;|ZJrwc1orp8|FSGuvgH4)wI0?wL7dg*qOY%BOjIS!^a%rQ zIRO$z#cHz?Zk!ip!dKcPT<}5JLWBH_{s!bZ`nrn}nUMRb-k?i}!EQ8Nr4-?I^$Ep=7GR0*iA^{qFJ1L6q4V-<*bK_Aza>AT^(S{#O4KK#CfVgk~j9K zA*GtR<6C!-ka}MYnT*Qu9E)d!o7lEEDY)X2pr^MGfJwm%g_VYI_uO!~04^`0D;XTCr$C53tg-1{6-V3rf#3>IDe#TZ z1=`^B1Rq{qKpA2k=*MSxwaj=IPr|({muY5v!Ml2Pvjf&?qHT*fLBR1&6QD)yxFX{P zXWh2aAgz>n7Q2-&Co0kOyWhq1;e4~;6#z)2a5c|S;Y@hV*=(45D(xI|_!m)bBCR0P zsfEwUFO_2= z_FgZD>7jTQIVQG$utOJaZw2k%wjsS2J=jB9abN=an~?Y7tREiM`0yO4783&l98XH` zGrb@qbsX@EcioxQ9dGwH9nLf*@>s5E8PCt0p7^PC3KP^%2|G`fmeZxEeL$Pg&Uv(f z38R97dS~nVm53y!3kra=)Ly}gdq8>9%&wGhpy&91b=cnES;f-j=Hwth6dJDhI%x;8 zmo~E%f^tkca@wVmpss1JEz5*D3MdF%$uUJLuCy2WsvV*@t#D@hd+;Hbd-nhOytC*E zxoO)|pSI5%w-=%=B_gDIGckH7z>4g|G-zFSDQaX>Nvi*MF91*dpa^&$rJs(!-Nq-% zhR^>k$WgcQ65cayeB*fMvCBlt0UFs?(rW7l66z526dUea`?>D`k-KWzFgU_q%iFLd zpD-UoosB5AD<~U-#=)J#{Y4N|{(4V7$ixXfGRavm@Uy@tp5ojBT0Lij!($i#IQ4azO!jd@BC@k@-(K^44%VwoJCkT6B@7I_Ad@y z1AJ3j)1?&Hf5qPXwJ2d(Vm7lt+(aRCB8gx^g9|^64BD^~60U()TU_LCT5uh7VI;^a z^}gR+Xr|YE;Z9;q3^4=2U4J0tFnUC2F={S-h8zSy1-PAhvD!Bw60y%4joz&hq9zo< zdlID-GJ6;cIJq5gJ&F!LfI4ATChxMTk+Bt8UjVzInFrJr)z$k8;IduO@PhOQQ#1bl zA!6_$`bM+v+y2-E#O5M@WJKaE0<+yabE!*d-9T~@;8JozDm1xaHimW_cDCqhef|15 zzVA#?JQnIL3#3#1;3mGTv<6FU+$w?j=&cbvmA|n>15^v`uloS)Jhv;hp$0;S(w-}SUM}jnQ1VG>0O+GV|)6hNUD2u=^td_coSV!!Q 
zjJcb93ec4hE~k|1qu9BIZOVNTqt_9X(#A>WYviM%;B91E%TFJ-N~m%x%bYnuX|bbt zJ(+3A(wPfuN{GAb`zibO)Bk^JY>7p%bn4Fm(O88r(%pTl+evvW7kwdtevQy{CQGJB!OUp4VFqI&V5#U+-pfU6h zJXb~3d5j1*H-w_3m!frCgpDc=0Sfc4R?stI5pmvH^-;q;s3|Sq|I4AJPAxb&HW>GU zhxTRCvYL4yF^_1;So-gs>ivALhg_R-DD_Y=bQJLF5pS4jj91Xaz$0Zk4uaZygGB-Y z8&7{evQ7%Ndpm&IAN8cT(Tlzqk>0*;ic+d$iSUp->jDE0&BaH-D=jb^Kj=~E~a z^5jaqM%CVWqMh;|`g3T*1ArmVs(O5W{{9(x9uo&VS*T zwz;`p1OzjH$=+8*`J70F93K7i%Vz)GIT8JQ+Gz2GuGg6}gv_dh?eWv6R>l79C=JtN zo=2)JfXqb#{ypiZH+DUTM`B-n+A4AJLjdn>S?4n|GjCT>DcSY+TP6p$jO5h(`@bkP z9T@3cXklVoK?^*GHdLL7-hYclMGfw4N;xjN>()QQ!m29N;EL~)?xv3oJX4PMH5$3! zvxl7_EJQ^L$vESzg6Hlp+)C+t{n9`>qBX=TM^ZzX0sDcxl0m1iAl_U50nF`%xgXjx z8`xgqN~A|Be|uK~wVqvmUgGSTCpc846@{at zPkv)|5w;vN0o(_gA~ z^r)u=)+gw_(A>Mj&TPJVO;v1~t<!f^Au(Mv%d)_s|N^%~-;`g1%Rvj$r%S=e}7fdMYqbTh0nP$Gn@n}7ezn?{wL z3N)g>F|)dKz<7Ipx44>dfzh4A+G`TV|S6`YmeM{~-51`7IpS&|Lt>N$FbZdjVm44>xoqycmJ zQ8u%B);6cEDiQYW;!jrENkjPQ0VYV;i3MCTY-?XHqUW*QJu{|zvttjhcvM(0OJ^Hd zNN9_q9|A<4+C|D!Hr4G^4$9(HA5t@0^Yg`xcC0y27#eLb1-)ieHO9VvAmZMZaJSFF z#|Z;a3-7awUL?*exxQWtQ3r%NG++%Kse~&Y{w7fO{koI~ z)VyFrf;?0b)KaAzv5{@zqD2c<&4dw&-$HvyO1VzW`X^C)7sNhkYMzumEQCu;*b!}{ zc+e28aB&B75~jVF2PJ}ytHxvPe}>?x*Ve8V741aTpuKw%EGWRB$iat7RSPTo4JC5r$J@S%v3B^Qk_HbUl+}Y__S{4b*q7n@-GjW zmt5;8{%xmQ{Oj3lRlr%oKfq(*?}O1NMp*=R6Cj!RZsKM|UL+ozEhB8G6BuVJrWelz z8o>>wDwBxi?%?St$502C#tP4@2)`|~Jo+ivnh>wuQ6($E@Sx22`nOp>KOXkw%DMCA-DK+M?@t04pfl|~R%Jp? 
zA_n>>-n0-uB>l`!NS~Ba;l($bZ*?m`JA-0SyYh!#?u8H@Ak1#HpV{vBjY}yAV>l?C z9OE32`Qs15<5S+V6OPW9Vpgn>B!1Mqcqvlq1n>*a^fE|FN(P2f1!f*JY^#vDw8dUv z{>R_MF1STLsX{l(@+Lo*(e5R(<8=>rL($j+Gt|qKOXfGKS8X9gZDlBwfv41T6EoI} ziU{&~x7sfU`0I8p*i1a4-|%IlnUClxM+JqYD5y_m)%--0?3ZL6X6$mjo#zc!lu6yvx5_ECWKC)TIuIO9j3{Q;dtg}J7cgE9O0Q0` zC~k)AT`dM`+g7khbaUlHw`}CxvSrH!u9dg^|5kY~V!`HQwintxGx}2By?Og~ry#3t z2M%Yn%7B|I3RI0xD*Wp892hkhsbPvf`Py+*Z|TonUj0fH)?ma44sA?_u9oNjvVc~@ z@gmaL#01%Xf-LLYWMfL0T{5PI+N^$sVYZ{UJx>3$*Sbl)%{eELeZ|vMlnT<=vk48;d>~i+V6CM0>0yK5lU8q5<=aX z@5ZtC$Bv#ww}}h3Mcy*Aiqv@`=CHA)Sa?H}mK<3Yt0KAYikicHRGom|Xjb&r*6$eD zRc9s@vq&rFqHSRg2%xP1ngy3xpOJC_5Dqx=$>HBSETD@vLR)&B6Tf3y2=z{mo&E+* zo29DBq68E!@9va1};u@aO9j*g!z~z1`k0-LfUdHAn9qlonupr zGLUxf_uWq0PlZf+ho2H=H@xNmo=8}xppOqLhih++IYwM1>_#82CJ4%ppFA-dJ;)zc z!-kPI#2A@@n4;BNY_!%M=ao6q_{z5MP!hc$j2Xb$mtRIq%&@%#VaoXn7uaUcKH=C+rJa~$O>t8o!N52PdTC}>TG|xe9f;aojgqX{k znz!)X`eh%&r^dH0)?9jn35+m3%M=ygL>uxACJ>;-)c(wRSYCzBI?N`DZzK_G{R#l)MINTSd@#~^8>=uxoQ%q_!)hYyy2#q7nBgV_02XLxTq%m8 z4P-JIR>3@Xcl~vt(}`R1&alt(8T*+=N5ZEL(qn{MYZ?!=J_pvdXS?SkFtT~O!-`w| z65`pF#NGDzBY!3WL;ygp;r9?#%itO~5%vJjd?pQOfW|*9OY4%P)q-Cq^r;-Z`Ll)YO|f~ z-O3Ow1GE~@Fo;?4r$Ge~>mtevmNzeFnqe(PImJR$Be{c@f^a2^Q4S2@|`k+ zlv;fT47MKbsjI$sV0+gp?E``Dj^})xDFCrY_Fse0OK}9Db@`hv!Xo}hs&{~j{e$5^ zez2EV-@Na{-n^nKQX9$x$DLzRIr(fGo0_89DW)om82yV;5BR^k`P=uSuqGO35)4+` z-K5N@lSswBKE4K}o&(6QrIil2y6cbgVhDJt27mS;19Mb`14kPPE3B18ip*f%qHQ@=&9{I_n`Ie;OL$%x zSMqPJS_O`DqxJ>pgm(^Mo=nhUuS0bsCsYiUsuXNyj0~cr@neeSf5|75;8NIgEWJ!@ z9u-jP*R&8oWYi8E+f84OVOLa;?q83gEZyd!^mONcKK@K#U*!}asUU_&)$Ag#;ue;R zLi<}Sgc=iL?SD1;Fq5XHrnf0+^bX1BazS!1L9eS+`>n=6fmP%Y#OXG+Hq>}hedj5T z$ox$bcb~|Cr&`YtnPiTX9mkR|$!+++rJlpSd6qI*w&f2BwM-pUaFl*>Ix z$IGj0p(#Xc320Lj(@%_5EZAIpimk_xGP%?c`w>h+w148Gi5{!bM=R)M2c*va{eUjR z0UgTWz1TLzc-JI%p<8c!#jdiXE4Odo4pTcaKAb7-+&}(_>viJn+3q4{s(otTpPpJq zCY$va_fFa_B`FkzDSUz4UR;TmJteVHI{vcVRf)YXMQyT-K6RDKX#IX9HZo#rbQt7x z>(^(X@~PXny(qRymq1vFmWMjmD*YbVsSx8}uA5#8h{^JlKiB-s#No|j)3-y6a~#mEh_G|M|E8tdLEC5iDAZrrPKMo`nmcUfcKA 
zv%l8ZQ`U>agPrduVnJbINj=guhNP)D%fZ;V5k(qI{=(aKVT{w|es8iFOD`?Dwx^zzNQcG6 z#YII`c}lbm>JS+)AosYyiv$C$zeO_YMec}l@_~s=2xz{YcuEOlTb{?G`3&zf0s^X$ z5lP717GFkdl%+#d60`rV8kh!S0|Nj5YkE*bL3*%u zZn*l-)LdACFqyB}Ms^-mPn5MhOn3Wj6m!9Uvh+rV1)Z-L*0=vtMI%cgPCz-x8hH8F za&`Kq)?fsOTX!v-*;4sMIajg5adrci&t~<8)1MPZm%^Va!%g4K>#3DaZ0ykjr z>bRNE^Ql#+R^0{`<-Zpiu~&RJwULd(jWIA(?73Ys{jcE*dtz;dXWn%>1t$Fw>!sjr z;F4NbeLx#Xm9q0gAE~^9XWv?@rc18g|CpsqhGxkLpR>e{wMXA<#osMw;N|80>!z(T z!=@cv|L($%ZLpkswfI)@;aT8e19Lqn;xF- z&$Z^$K_z5)A7mOF+w^#)wR~TAXo@Fc=cuHl1Zx{@>8q97+KnU#s(dWh-&YrDjMw0c zjy>O3*E5&(;-XD2pY1XqaRXIP*jn_e+DksRNb7>ss=~q;K>sPFA9U_9h%Y-*vhBC) zn!J;-&1eYi%b?Q~031Qlyl>jlMI)pv$!~{;hcliSz3RILB>8PeM_}7X`?BRW(lo8W z$cGUKs%kHH6JTf2r`<5+Zo&K%NY&#$m4?5@_kl|3D029}2H40D1lTT>^0PlOoK0NU z*4YKNAEx&nEqwgAC}11_k$)S;U?44s*|mx4<)!S;tn+)tO4-N~3P}Z8=pa{IQ}oCi zHwDWDz4xluS;qhAGYrbok5as9II7^epTf7YTOG_AkB=*kiB4fyazdn7k3Uy>;BtJZlgMFTA>=^(# zhwDj&O?(NLkb4+UX?==X2{)z;T0-| z@<3mE)2h)s$<3wPwr-7msOF~xDOk;NrFeGu$0%j7w>d!*_NZ7S9RN%X^CQ-Is`m;o z$*KR{J6B>+f#bXyrWoL1QT3*p}%|)`n-Hk z$uq@>EEe7E5AV;}8z`{)m_H!(A_Fs>10#3NUK>LvjRFCg6O6<-fWqDs3B*vc4FVL^L!Mc)SGvj9oc(0&VYpA3 zkYave^D0%3dgg46Q#tdO&pq;|JEoT05V=~uH^bR+(ms33i}U$bnl~QUeVH@d*L=0S z(bOfL@x|fi>28!>gZBL$mBpshOSqSp=AmB_B(+3PFeLH3VIGD0e&K4)$h3mduKcaV zSu#()ANB89fMI^YUi;iM%rSKfMeF#gpz;vM(dE1Z6&(^_?lXy25&O?r^Nygez}dvO zF{R5U%3GFxH_zIaQe-Wgx@_m9;{phvTYVvayALNaVz4HtAAe2bRX5@8t*Qu=Zb&Hz zcqes47#BBMUA-w^D<;gXf$>CnVELMdZx{krEc5e_8?8PZ zQd=xUufkY$$f5WBRfX*Il1knM+hq?n-fE#;KFh|tMohZNI8FX!b$yMP*_5wUl~wYw z?&vwqoTnV7@=CUw<)} zK9Fga(e4KS9eRYG55OM5jk282jyXRYF{I}4XaC=lb4tM^13bnu_dj8CGevTNPIGS9 zQWc{4v^@eKx$4nF2PIfIIdcIYjRq7oy@EE%6(*(SLHBs+61ZuFg-AJGRaV+#MBW$~ z>8;q3&zF`cnrcl?jQ;JL*(}8l+!O1l*}(I!kB{J_QYB|gQmTVMkd(=_)s98=H7y$9 z+2Z+HH--KDCC@P4kJg+zbg1#sIW?X_n2X+^-;{o6p{(QnkQ0jn64#4uI5GIa(|OZ- z1*TIE1Kx!_I}ibnW!#Ybl0=iQW}CS{QNH#VOM-6P{(T|TL@}gV#ATW%L|i08r6M4R zo1E_zn!$;m-ddg$tQ?3!*cD?rm~S`R_w- z!&Ks?8?9=vv9ICLG8>>8o?#T8mn=cm+B~{-Bd3KC{lq;>kn3obr{Qi$kOKEs?@Rq{ 
zVUWJ>B|1hgzdSiut!(}{qB(GwFP3hR!ua08^@0*+8Pe8w8^vq{bZ00#J)Kv&di*5Qz6MIF28Xq&tqzmDBQ6> z3dj)|{6#uUbtxaOHcC%CCnn_tFZKe^951F78eCyOejLec)z`dsDKNW!p!^A!+ez>&l-7GKqT>!xB_AhB~fA`D-8Yag z*(1i;@zyO5pG@rFuMw10?(oR%%eEgf(X6PLICsf*$*Xdu)0Zxr74yc|BsSa`_H=5I zSTbDP+K{pT#Zoi3&3#c9^z2^5$E>wFWBKqH$LT1i*JZ2d;g&xluL66oASL z=Xo&{N?6*`VI8Tx=^B@=RyIavQ%nP0*9M-kJIX@R7o}l30w!ld5N!>zlF0U;x6r6tERf8ExAqa z)3zTy>TQzp;31;v7tYbgu{yGq;hUrnR~=O7?|Z%ST)ytT3t_tV-(d!)4 z59sVR;B=9jioNOfvVTM8!H4%_Id2b?1=~FxZ|6?XHLyO?c&pd*b$Igw_nnqst+%kI z=~Of{<4sA*WFOXRirOV*ys^1e(hegZ_NCdd0H6lVZfTu})a?S4yRp}H{seoz{iM2# z3erf%;Rbz07gQ&xuz;tV3u_N{`NUilyvKLA#QF%zr8__?cWr)H4#Wm|uR#|3K*u`w z4Z3HBa1SaHYW%9o$|MAqu@@F=8qf?w&#&um-QLzZG>lH3P_8gnB%Sm4q8F}l+v-mF zARnKYnQPqs?kNWM?2~yL9s2uvS=0H;LN~siobcfJbd{Sax?)eLp&xZ)NaCT@Zq4hd zhXrCc8AX}w8_MvRs=nzHzh;R>*gZ=Bn$=wIieCHQe)g$jj)fFapnFQTwzXY3vto-; zvlT{fFF*=>`w|Htq#MVdsV{AkG(aIz)y`Ts^f;ml3!XpUMZ`JJc7jRG$;oN*X;2SC zD{;t;91~NAi%R`>*v?HpKxg!fGL7A%K)4$%2fo)dX!!Bt$G^67TKS1(P@&AuSVirw zz1#We^68I5<&WPyEpONnLQUm7`EGcnapcL5^F?;9x}9X!kZg4}%)TM-K-zVuA&vb< z&+gFMn_IK5J!M8J!+2H78!1eMex%5*LSu4dP>va;S6GrUFfYsIw{GddpME;fEq^7^AY{2l@`dBNAl zPlt4)EJ!N61S2s#&yoNq%ck0-}$)OqFyy zkMBu#n0NcpP|1>)cP{z(?B?%M>wJ0I6SMUWw%j|{Y~}Fg)vnQ%yW%zcnPrcBwDMoa zMO@6Eg(C!5HN0NE@(8^0Jm%1-K)0hHH5jOe3qcrDDVd z@OxwnY_&K_&u$ZGO-LP8d>P`?$oT(w!p^Pz<2FcMN(Ixf#A9fVpsdEz9H7h4_cc6g zbLZ2+8Ik$4t_O|1WppmFb$DrjnCM(Y^@Dw<4iFPf`oX_}n6$4L z$J3=JR%zd{Y62+x(Qiot|J^aG=gmzG4e8y6!)e|k>Q~()ed+^3TZ@n7FB#va7@<>R z;W1AyZ;NGad27*4hlzh!pN{6RMP7IKS zC1CUT*@X)i?)gf>K)3W-0s46G`~;~G#um41pZYCX!p+USaA8l+p*}U+_^_E7*7xfc zO%5*xBULLMYR>4u`hzd_S7+6XjkkA9+5~Jb^>d~R_pDDUsVt>)a%}%xWmi@R9Lqz-ZgtxLhl4orKrQ`fMwAR6h?o?sMjmJBIM}l^OJ%d;#v;CzEP16f{h+Ti(>K5n* z)5Ik&R4bF!dESc94;I=-Ixy7J%eQev2WiJFlHcE$^kvvM*U`vQ*J1R{+5S8eam5st zE_SIr-TRso^+__K{bok8dOOcNe3D$g$#a{P|{>Akmg}-JCvfDJUqv7Uv!#ubK50fTuBcE=C1bK?wZU^g*2H zfI6-ct5&1%6tBzDadpkvwLCNAsNb7GWeusDpF2KDuD0*0U#xUT%49>5zY6u@J3r-l z1Jb9*+T>k6)p*=C3JA0xHE^$s)i`XLe`?#4IPS-dIZK;(t;R&!wj4Rq7w05bLCk`< 
zHji-DO{FUem;}+rHo7diw5iDrTw=9Ae(oYMW|iQ_viPzStqGj%*rL`9F-*g6JeXmmiNIW;vg3@e+P72@4YrBj2%%sKOd zS$|&uq^bp5L`$l)$l1j)WpfrcdOR&v93SRS*(4VtchfA6SKq!~+OlW6oq@-vk&g$Y zucma@*Q_e#tqcg)k;-|M(DJNSKvw#WU+xmS>#k2P4)V#njuaI4Y+ac0QSpPdMTY~! z+>!0APsVE`XeBspJ?1oZrY9UQkcoj!c(-g)FobS;hOZYq$&OCB)Vq@|C+}H1EG!#M_QAkOtG?Hi=N~APsREj1|LMo+c>xc-EN|T~AN~zFXM03(S zpn>LD^W1-ZIp_ZGJ@z@zeLVNx=csRg`}cc?^{%yE3KL1xt-!8CQrXke(qaaw9ZAQe z?~_Zn_?XjS_aZ?jzz#)yLb?w-1&Cs5C}L?3slVO`;aN-9v#Yh&VvcbE7?jKp?sFHK zYbsiMJELrZ3ijG|Y!>V2Yo5QEp!2%_rd|BD`->*JMvwMRbUkq0-)vqrqO|vO&Ox)?5=Kacf-=j)-seUKp}X4(qTR=rx%i z)=T#membOa3CPm5@l9K0&6MQ^o8CB9d`ej6)_AR@zlyf+u$IiEp1)UhXPmD>Yr9p} z%Hf6z)AV#s2leio+UtT1O7b3xsSszLXU9btFikdJ6Mm;}`q2L^JGfj~pftt>5IKHb zSFvwlS#jfyzsj0_^yWQ+7ZLd-P8wh=8K4N0ycCpA{@zbvT;~5s6+hmb=QHxz65Syj zR3JJ1a}cxu8!h9&o1>`e}1w^`QnGS=Ps1o&nexP zdp{@8STFLifuYuP&pgPzEvp9?JG#EjzPI0edk+k9wpAO|-*lao_thLfGutJ0WzGE% zd6SNL0oF+Y^GC+SjuM4kWP;(Lw3tjwp0Qqzk2~IWlZ)4JLpmHCIjX7CL(E0rmEp

n!7c+&V52{zHILJ7lLb(%ePy^mfl90gG3)afZ^WzEr$U5$NbWX= z{;YA$rHB*BlK1BwKg8r*d~i;lok1#P-(HOb)zhxcoFX#3odquwWdD4rJyTjC%f2b+ z>VlS2Fxi3X2dBM>BW)9-_h&FF$?Yo%(O}FSl67<1rETw0Ft(L^m!(>T?)}S`*^v&n z1e%EYqRU;L+n3keru8JORxp~b6rjZY5nzRT^a+p>F|OZ#_C2it*UtQu6;$c01{_+1 z+%00D;nfc`oYvrH}(GnbKTl?Lo_tCn zLX?Q{>)2panwV$G~r6$lvVEKL+GUioG*?OjYxmfL%->!r47$d+7o-?dM*zv4UjeV_fcmhL=SDFd6D zxv8+Drpm!F69;bJ$()*;e)e&mk-ca44*jUFZ7pl!u7}yrHJE)aw5Wb)SpSCWS)ze} zN|Fjy*!yXhm2sQt74JRAAmfEjE9fmKD+P}jYTqGNF=5)JbU6<;3&-jnvlMMV;k&}r zc>k5@I+1t!JQQAF=GybsQRP@@z*vo8?MuP6ApRZC4BWK-m>U_Ks`+PiUro6k*f=v7 z-oJl8wmY~0Y&-LxW3>>~RA(EbuAQL+{BNQ}Grj8$TTG=Z=$#{z7^^A-ipCCKk+jqO zo%Nl>2PC_{=UZx6`l&=6EgP|0ZgYFC4tkUG3nL9ZkGif2;F`~S_EE8@e8#DONm1&( zPmcc3w`|LXE0e?hpQrVgEn8N7y!u3_R?FuvjFBWwvCpN{y><0p7#CzlW-GVl1~~rQ zo?UQaZn({3tw&3y{rF-S`4N=mQbBKmMDtDkbtT0l?6>$FqlH!2cx@XsL0|ulGjWlk z{-&o^HxKzX36^)WX*Tlnb4wn)bR6lwoyGm>b?;ItQegp7MRL-#Y3 zHF}T0`O~SLf)C=@>FMb`yykf&K7i#q?H9(Z>}*W&`LAKQh|dL&wnHdsS5sH*8-tZ> z7SgDJ%eY@gh_mCCUG`9%G=dTsP6^3%KM2y+0*`hC>1o#;`kL5Wvo02iG=N{(`*J00 zdM44MY8)Fq=p=_cMB56C(vHlN=^j`^}lp-jB~&aiLmB0rL(agSl%3T}I>C z7X~-43jT%U`W&FHYTsu42l!Q34F9_QwtPi?N;{LeVp8M@3wRvZ_Hc=eZ72$GW7#an z{vZ0r{}%;$pgDoP{(PS|OyS^3R1bJKl0-~OHfdmK!G_akx$n~AsAMi5C`KA?tW4vSRGVjmJSJAyP|7I~ys@7)FxSD%<_VjM4as5rwlMj_?_8*2FSnLi+ zXavHdhoDzQ@`kl&FKQkXGx1R6f}JyM@M;bi-<$0LzVv@6mUvsV_Y&Pg$J2AkKChfl zV?ulG2&G;@oeWDVZ%!%YiqbCGFy|33;^5IyWA|)2=2z06aZ5jHvONMH8JI${s0Em6 zSVeFdB<1w{$D2X>iw5m4R_HU}GXz+S_Z4MF;X`P$YW=+MZWrO-$xz11RGMN%Aw!v~ z$}+6dS8mFj)Df|aJ5bkf&7trfoA^+lFvr{Tr?XA7(KvMb@eSFY)Z+9x>fMsH)@7#M zt0jL;e?!L4)`jecmG}M9&hO66@i@x1eY{+%Wia*PL@F@3xJbcmnB|%DI1ZOhCss6_ zdiT;g_~x2!cFt=jO`wtT$jFJ7)KD3Getc6#7N?AZX9)=QtDF)?6KC_AxkO~lgoORH z-VL?oI?Kd3)OZ=~+;*^S>&J|-nfh(=W7$Z1K7wS3GA}fx9%I?zBSG8#b_?NVKJBXz zI8B}34D)zk%CMj*Ox4cG7JG5Xa4Nw@df&d_I^TaTA|dWSN^_u0YdFLeWHq2@sfk^j z;nr@4E9vC zedplfa+95Ch7iDe=eUx&!jwhZ+m{yxovL>DXXnc+aWrK5J~7C-+KkCyO+ri3IltZ; zPlS7WdndYoMs!sppWGx&3?YWy#ShHj=v7N?8?Vyrt^+_WXj<3$S{p;{PDxB{UJsr-Z%E@>3NS> 
zZG{+=@MNe&>(jp35~Xd3W*xg5l*H!32mPbxRg1HQ#{bbREd9@lt;0|^fuEoudl-x0 zJ6wV$HFHVyrjaQ+{?HL)m}K|Y$9mGdTVlnu*e2S~LN5Vc%9yVOhl?Xo+4wLiz`1aEes=X)a+9Pz{3u_dei zb!f9W_s3+m_XR^*#V5}#;~2pNNZm}>@#pi~ouv(TNKxy2f~UrBEMLgn{I2@>3Jj5_ zVV}TvY)k2K9vvfHZY^;o-p4vwS1(<6{2xWc-(#^JL2Pup2jsVx)h}8CyK&SoTo@K4 zjNkw!+1c5-_@``l?1wmeA~oHxobYgAMP%y3w4T_z^NZemg09{CLW}e0U8ptTIX9LK zv6;8GH?awx5=+wf5OFNqf-fkl`yZWk+im%?>PAYG(!yBtw|)F<88V_-9l=A4?7N%G zzvaKrU&c6FQ(HS9AN%76m+YHiCVtzC7rU$C7iVc?OM)cg10(eGzIFcTJo)j%gDs;u z59Qd(PH_zE)J`ihGuX9L>*+()CrY%U9(kyGSC!M83|_uaH%s^&J`N{=;kmwybGu0> zRfKnsDLn5POV`H8tuSu9R4oep=fLqE~@jiLsffeXpP^tm|wkRyivr;D6 zh$&<~XEZLR?h(SAe!uh7#dAm0KEs9LuVMccDsw&`Wg7EcpM9&YLi)00%a*F!;a$IX zINa(41{d3WAvom4*ov&`^m4;uMj)!H9qe<_d4`wG99<{%!(o*TFm1>@C&95>k<p)tH!==}h|S_9mnpK#gTlou=buA>-q=5FOQ36>b~*0H zul3p~p`Jrxl3#mIdY7>J%lGvRT5~RgW&TQ*Zr`}<`@7pp1B7H2T4q+@a#pbNFML4IZH^tM{sQS5XpRPH1mPz}LyK}Xv_8pRIJ)lcybdTLT!LnW< zxgZsQrwH!rgTxLsdoS7w`E|cnkfgjgTDW6?NVZMtc%16qeft#eQ%Nc9WV(K8t&bf2 z*yz+O-{|zwnRmK^ZdV}h-?56&T-8>s_!IjJ&;K0LfG0Jr{gQmQj96ufHND4j-$DEv zvl4OO~q&PIep>+OnS> zkmucNQ=f~A8*6uMXJF0RsB+X*&E$IsEt#1C*}8bwLRG3*NOgL6T3{M;w!1H#7GPXml6R3lbH`}!OvVF zg~Cqs3bS6umEuE8v2P>Qoal|`eWs?aY6ym1VcmJE+g$44!HB$Fa=WB^%YOeZ@O%?| zgcXwGo{#VxG`2uMQ3?`&mAh9of{_(nA)xUegM&|Y9)_c$2_{@8PHr!~gyB$}W`adV z03pbQFcAxqO6uz!(9z7WADC+Oz}_tQ|94Xu3nyJ8I@X0B~PB z;By9GGWct0(d}g{n?A3eRf49F+=-ei9S?%hZfQ71@2D>25xUR^jo##%(qbBl_KVwKRB zFQ+L?H1cvV#7!ES3}_@~2*;MQ`E&d1TTsHBQF}UT57rfWsmn};i2p&PN(@`AgHDK4 zOMxNizKPHQ7upD^69;->?HLAIPg56d~3EYm<;GrFVcYB!S4SV~i&`e;CBc~y5 z07{>pmd0Zfe(L{O@BbTV-teo;^a#9rRmzu2=<5OEajyKO!}g=SGal*T;Q=!SbcB(? 
zBn%DUU=N^Q%D!KF{u!vt0QynLQX^;q;f|6QkRmb(EWaVOZc=zF{<-IZ^I;9tsS`+787ee3=foj=j64L?Yu$g&<< zIkT0WhMnfxGmU^&l%L4xVAoyF>nff9w&ncKSgD`X3Ue!{xa~#OMj1v>L$<-W``n-6 zZ18fgoMT4h-1&_T!`Ak;md>XQF#Ffs!QHZZ9*`Fp6}T7v9A~avxq?;$;kF5eca$b< zvxIc@K=V;kGANSUF2a-~z0~0K`fjrG>)3)+i`tz?a5dLydP!nSbD?-9G~^{G0(0WA zk#UCi_a>XTIriM#TpzNbkwoaij@G12CvNgLJR_luYCqfx-#cE(DYW#*9LQOMi+iMA zsXrjtr=psKyR)mSE6uXiH8(5K)q@iRDmo>Be3Bi4zt#LE<2jssDq{1^<d#ke>#SGMITp<|JuHKKgC%iu@LT9h-?4h{~At0_Msxr`sP=}Zuq zIXe%Weg7wHH*wF+lvkRDz9Nu1IAVd!jBCWcZ|6&5ty1Iz63S%MG6JFWXy$v@!BN8_ zRSP0TOrR-Af2_0s%6)MeV#Cgs25k4d!@7&z5|sk6h3B7A@c-PYVVG438RY0W>;RBM z_|asn5C>rgyioiWOBg4=xDkAzzT*%Py5(13e?r}7^CVD;CFpfNL%PnrXAgnFQknuv zf|O3an{$bg6L`9E-p7w}2Didns{^+DfbQEa--JZ*RyQQ)9p!Qn$2i7?9)Qv@n0XO2 zKYkX1?TvXD*ao&UU|@)D6e_1jcH)`5Ttnodi7niVTNR@ZVu&{08xDHV{`q!ciyWI83`i7B_!f*dJ@%I|S@~+b{5cabb zMA?HlQ4Q=;TZoB^D;F*z)9UT;v;#5M$A_ubh@0=hOB%6g=}X@~Af5dsnk{0aMsfx& zs0FFYfd`rVczWH2qYB< zIK+ed1tn&KnN;gL@}MzdkVj`joh zJ9iWjSjVX5=SC|=_V!?J{l7oAv?&8rPwP6sh4#8?OQ8SXK|nl&eryouD035m0_GyGJVB95+c z$pM?d=4;wdh)H892;}m32?GePw_XR)&DyOfvY;ns)rIOk`5E|n(RP87@RGvcBkD5} zw+1ct9`f7|A%HFEFATz2+$SR(?CmwtJ(n>s0yWHFXm;vLfEgpNq@?7l@7PNH>B;`n z8OyO0>?R6CZ2aZbW`{vh7xom5D-kw6u$dx0A>b={+aWv)e8RjRE)^;Nb3Wbuh>Ekr zhAAHvXWxd$;7P#r`)&ar*Ta&d&*hK(KI~a~Vvqp|llT4fNR#S((9q|Ov0=433B*?g zHByioGLesVpv+PE1LJsO3Y8?GO@2}3Bc+=Y4uw)@Un8+H$9NqXaHglHXGUgb z->Ba*yJCZs8QG4RGsK5j`d$C|12+vi%weUoxMGH4)W8xjPI(<19DLxM&Xk6~o1dH^ zCB$jz@S;=Yz`0`6e3l;V!^fNE1%sk*FjST*8o0;oC{Dl)g_ht z^4&LAy3U?W7`fMv(5#o?JVu1DC$#NDUy_nUB23PJWyj3Kl!TS4*Mj(M{|Wr&-6I;1 zA5sCaTioGLfdqg20AGU2)9bFg9UC;DXrCBrN&MF{K?m0o!!gz1F}9i)I%0Iu-+fL9 z5sgJ{Xca;$%xRAtJ&Fa3FL7PrEMEN4>((umn{3~X$B!(P?@=3eXuS+^_TXK-)wg8z ztn7gqAjR%ZnRuf5DZOK^}%+73ET*84vFIwezp4s!iVgJ&C0{*u)eIncnXY ztUU8+-_Y>h*b9d!tVdmX;AGSC83Gy}{MvZ228>G@8X8|a`r88<1Tl$$RR~i^M-W57 z!n0`;Z^1wp78)l{o@D4-FM!xk?@(zPZn-3K$;d;^N!XN_@eu3d#5s$!%;g<_zc7Ht zl65Ln*j5nbOdmBir%sNJ_QBTlz}o2$aZypV&98V3ZuL6;e&CnRf0BwJe27MB9HC?f zy+`-za+8Uj=vT9WM!*OFqRIHZ#DN`wk(Op)>S_YvR)m>jU+}NIl|h1}s-083XG9vC 
zw@ZYfx5_!luwn&2j1DQ4)#J-AR>Tu>eOhR>50Vb>VaQ?jV5{6%aZy(!I5n{Btfxef zASxBZ^gJZOcCXF;_3GwkL=EkG^Q@G4)d#S24|vvIV`LQbYxWomNzMs6h<#IFwgf!U z>LMhh?&T{8-#w%X$fhfg=76gX%PcEQUnm^_zvW3mWWTJnjN4L{f6e8ZoqcDZEWbH- zt;>96L|VVHa{6V*4j^`sF+-ytot47u-FEg)0T>iIGqrD`8Jt#~Z{PNhwvP&(N{g}a zQp12UP_DeL(|juAls_zHT{1Z`ot1Zn(kDk9MC7J~`u`Q0u$vvLFe2SZXO1=hx%2Ka zifvkM2QD-=e70dlQj>b;JGAGbV3-5d zYswr`Q$xRI^NWCD2Q#$fBin;0IHCNl(;pb^1@QXFY$Cb`P?=2i*j%jZ$Y|H6E7mRj zUjx;>Lid82mR)E#5}nUl9{NQ4Rl8#L@UyvlkP(4RuEEQvx*I-{JX3qk@y&JEsyXhs zBjN3{`=2EnQQr{%Umb4}97J$!gSq_f+$miQV$j_qEgdARJf1zHMt}LTrkH}=S>;cc zLruYn`(zQeCVIxrcUa5BTa1QVW<(dQ#ad0~sYmvjpFbaynVET@23=R_NMiEx7ZhI6 zua=3HiHQs+=Ix!Wy}d+qd!hTf7a|YDeWWw%nMt;yhVxQ~wHF^kkI0b%^v_UNXhc zQT3u7MFg9`7J9UD1Ffy$tdX`GQMJ6(PCc&*pJ@12E=7^2B$NsroT-G+V;s1Zud)Z` zQIrHLMNDKX_RAVVw=oNwa`pSU>?|Dvt9j+bz`E@Q7cYNW%ZUw@9f}=0=N9t=-&pBg z8yb`?^;r4n@Ea?L%N*0?2cU(;XsP8^4mRt3V{|;A9_3W8=8%lyz_;QUnB$r+^K12r8Zcie|F{!tU z0z$K{r$JEOKI)v=VYz0oFs>x7)^gS6-Dj+WRsiiN?|&VqUnORk`siTe$if{>s}oi$ zZ@R6iFAPHwkk~1ls4-->|CrSsYIySjT=<9upnF{BWZjCwWE{8NZ+%#C4A{B(9p7_h zOPA)Ke?;CU;S6+4kAa7&Tfy?5gto(HqGfY3uk!u>D{>(y%Zv|2NQg;6iVU-^#o;kK1WIfzd;Y~P z9dcIaGWR)()|LL}i&Ken5SPJ{n{^KKzODdTEdB>zULU+;|8Nx(%Tk-X_*L3|GXD_wb zI92(UESElm@eMn(Ukpjy9-U==#1+94tSR&vNqSmD940AGzCHJZZ=~Ac=!3Cp&;KoX zWwOEpW)cRlMvptu_&l#BZLU2#nevHo{i8K9z&Bt8G2^`59MGogt=At12M0%Xu#bxe zrlsw%vC$-}RoHmp!StIY~Eb`lTpPec~m?vwbQ%t=~h;u^>%s8Eub#q-^T;@=Pp%|62YM^LJ zLR=yvGR9X^gzG!sjtZ}ODKBf~R&+3^<>P=T!0pr6dS%3Bn!tNNg}z3Xt$8e)+UY{z zcWAz`-KLGOWCmqW7UoC-kQ;{VHu9ttE3W{${mBV`0_6Ri`o&Py@MD!(oa)U3i+;$lwTX;oxs6+EKmg|i( zz8YtkL&!({N3i5H|NXqsH%p*Kl)}~L|WRiHC!<-Krr7@-JUrIO907-yrpBAegGB zUi4yJ^!1yZ4_{3Vew6i+`T+6v_;H;$@bdRL$w&Dpon9t$U}15Mc#Vwna9&I^f2MET z{X5ro2-`{O?0na__a=mFX)Xe$i_Kxi$z7j9ZhNKw!_l0JuOufFtly6 zh8X(EOEE+3bx*Iolxc}|-1SoZV`tS3TU{|zkCv~|;?}j2b3rN5CqtxNBkkw0yG}^^ z)j>=7wHq5)lg>AvSlo0Pl-{jjpCS)!2k^AG}^P^H{^RpnbR4{)Lv+2?(Go zSFO7Ht_S{TUu$b6_UzHlj*^k_>xz8y=8Xj1-9CRf{^2jxhxmSjKjh81O`XvaX3yrSyW& 
z>A8z~u<>A-B+bwcG&aU)-f-9d5+VDfmF8M-ZG!X6hr(|P5pF{Ja}~JFt0EylWCPuHP*5kr zq#@~S2G8$$Y@)LDR$kgXY;+MyJnQyfd-edwW%_Rs5u)N3^H~-Qz+<2>r3+&O1nmB{ zHr4dYCO?AcB8V{@lgLFfkZ(w}agc#8D7{%(zulbR9Q;oxCNZw$x%bZU59SD!&d9#&4zobucFPIlV)ZN+&WY}K^9)_epzJL` zTbJ>qn)hCuN^x`CRep&u?t_P>S5y?Sa~;Uix^Hpwz3)&rYNGqAGmga=SufBQeOABx_Ir&xudUb|-JNt0FWc1n(_=Rm1!Dhj9AB zPVPEvxSV>+mMxgB*kSRadQ_QNp)b#|hxjC={>)(m$CmwSHFD&_QF)HgE3tGvjFTIX z#w`Kc#YgHqC`a`a3UnrQYG)&fD4z6t2bY%xNp2bI@g083bK+6VBRlr#iMdi%VdYJ- z&ht%vu&S(oQLn%mqvMv--B{&)ykotI6>#8@0Y|0rXS;{kV)W=;s-Y9K29OG54-z3*R z@jURF!_e4Z`?L7nMS=TlDi-qNYU2`TM`m)|^|#SaWI1&_vuBMJXZsrcBkOtJLbO(8 z2GfVf?!Y-^`25DT0Q^VlL*)Ft&5u6oRrQHqRs;X&z`PYm7L2G}qgP)=D?{vy7yIyY z$Hnf?TVjQLh!kEbf(hZeP*Iwm;`OCh=W5chJ&fB>lNvj8S1>X@k~*L4c2DwBaYJL{ zvFC*OW-0}S>4#f+i>EK{mw)+Dj_H;IdEeU$p7)=I=s&-kd7U+}d1m&J&gK*r>lHF@ ztjg~-(;SUd4wdI@_a8qHeUGa$Xq~u4r>NWX43}%_9c?4-qbXfQwbxn|5;cO=qmDm+ z^19S!6lbl*o2uZ)(bdjl!?GQ#{Nn4z8rPkQzpzdJE}1b$??y>!{ea-l?7PwrT;Lrw z(D1_ameEPlnZy$HN(7VGw^OhcZr|GjOO#qlQfC}QPvO@7{w(uqUb_%Le=oEWp9F2W z#|l7tHREQPG;~IUJ3e$*`Id~vJZd&f!av%geqGZeTYSL&0N?{yM{@w$p-?m?grHpM zciWx3w+t$#!vmgr?^Ma@zNt<+z^>KxKla34QxYHR|K1au}#x5*Gq|MBg z|M;Wip3ezkPUjkerOZx-sfc;9gzdAvT%DoO8}nz8QQbpk{zY}+r}hfLLi1zkrPa?X z!g&FXJeBk^(%=S}gJ`d7Vtt(gFaiNoP@kWFn8IT%)f`3CU_oUT zW5e%U^b5j+N@dL4riKr4s*SYsyM~F*rX2APOu5p(5O)2x+r*&b#i5t>o38I+^6}Gx z13+Y^DmEu(*bl@hR#8yX0hZ+}V!Zhxd#8$$>nUhcI7JLe*yaXL z9MDM`FcUf|X%b5UXW?#tOk-!q=>)9VLV6QBO~p%0ec5ig&vli3`LYj_qu#5K)o#7E zeZT9RHih?`r$v1IwO2yUhf{a#JoPD5s>N)XL&x_UUmZux?dW2wBVQ-#Os1IKa3~6q zI6cG2Ui(SdAWJ2fenT$AU87E1b_#cc(v8cX9LD>RvTE8HIC6CmEVS9lWLFJi-B*4A zep>KT@|CpK9j7O(-3JUz)@w^p=%sOV-OsM%Q4VP+EG#T9SMsl519E+Kb=99o?SMic z{h~^afkX%W?mn1g82f_wOc0mWY}~^=%SjCLk}q@y24vhpH~bpyAL1n%{l`)^PaLI1 zED8LNM9U#{VIwp1sb@!4*ey+dU%%duRm4HRdm)n(*DK7~MPFi}rWa7@>EftlKi!ku zAvL(#6~pv_;QB=UbY_+b-eV8XTbnSs{!Hzul69IQWj+Xb`O%YQf0g_j>wB``u@-SgDg-@As-EIpnk;eOve$;mg+vfrsJ7x4i@Be z6!{%pV4YWge~tIVV}<>4{?l4G0yj5)jlaitSibbx$K$bqF+ZAVbaVwRp6-#}8Gq}B 
zET8_Xb2scu5xcO`?<|fl%zbq!xgZmD3zG_6q-(3M34QX;#oUi{t6n4|)GmAoV^%3; zj695lBHMU(iSk6!TN!!W&jMvyvUvq?n$+1G$I&R%G_VHZ5;eZ%|F>~1e&o{61$#+& zLwH%M;>T{A&(tf{^OhOJ0FeAS7<&5V!itp*ujdP13oK@r-dvPi6=OG1v%*gN<8kdp z;e*r?hphyuliLa~31DE}(K z&v4vt=K6N?)c8PaDEyz5??(*8FuE$}*J$ad0I|s3(WCC^;bnGg03!XQ&NHwGs!*z` zU8qVJpV`Qr+{RR77|3s4WxQMTGislf%xkmIx@!p=q`}#DIwe{;Y>$1{L^)gfRlPIq za1rVqxZG79E$MYv*$~U;MnYol=e6k>>n!ZahA~AlDTR{bC{OxxQvbtmZX} zDzoryV52jhrH&Q%>59ZYv6_Nh^eMw#k(!#C|9YlQsO42uR1_9=Aszu-@cu+w=*{jc zYEsfY7FoeZ2Oz}$`-j2=4}P(!$aTc{6dh3izyOAB9jlsvpuK(o8ZxR3M@L5}oRlbB zNzxTse42afT)gG;wjQfWWb13J=Ii_vl4bp)34VcYAA9IsOH$09_Ad90ZdH2Nd-la8 z>~wh_=8N#85uA@)-o7`s-&_Tr)w?L|n;-xW-fcLWDpPt}2LybE zQ}%beyRTirGi7jHm}3Bh{(`v)%ZMO*PUq|h9V2X>Ua;49GAsB0apwZ_{ZLev(arl+ z-ygvjbCnKb%M7&Y7-Gk(UmfdK1vvsq=Na$sA_C&3lk#Hd+K+`hixF19Z7mNO6Lep-d^qPWKkeG1=62gOkG`_h3`@emKRP-jK052b$jv*)xd8nfXPxwjOEnL^s{CM1ds<|NQg*DpoV{W;-IG-ITy?9S|&S6V@ zQj(OXn56v30I_;xhe|jJ0<&7hW25j@yR1MVWfG+c(=}=Y?R$$0ADVZS4VMti*_KRV zivK{#KpHz|d6FaAjgFIciTP|dG=AOxI7764X##(16~QPd;S`^qEe3>^+6Tx;r#T!a z?732~CZ!W*jA(p>_i*zV9sPvGVljsYQb@z2*8;lFPAy!Q8*$a*R__#CR19z7f!KJm zfcnu^xsscUnbCLQPPTp*ugEGw*NI^40{VvJw#F%qC%#~0lyD0F%Yh?V5iTw+mASIX zS1-b&uLubGoSD+l@Lkfcidcg+?b<}uPF7LQ~aB0|6ZYsh9u2Nu

GB~Gch}Id?0ft;tjHj@RvIf!TarTWsy#Zi~xr+;#%xo}iQ3&~y$+rif^ySOPJc79zhc4I{^97F!T4x5kydFBJ;xmNn@&C7_r9KP z)IQGq!J@Ek74SS1J|E3ytGDFPzIgE>vML+{jNU&W7a}Gkuo;Y_K#gDe$j9^Y^Nk?i z>#Zf4GfA%qVJ2)T`*QGzs?jiLz)CoOn`ZzB9JSaRv-?)gfYbzlK+VLw3-obIJC+Sw zsewMgjI8I&+eB$)9aZJ#*!0ZIcnZJo^NOTk^6EdMbe6-{wtM*9eD5iNQ*lkH8mA@J z%Q;U@?E1z!D4XTRCAT=W|5QU{l+TLYLVFKW5)&_Y%)_5vRBQ&G)^r$T6=mL4am$r@0I2JXCgRUC9=_7!@2u6544G0<4lS+mVgbItLo)K7!Pp5fglUiv6Le068X z9oA(N^f4%u+$Lr$HyKxZS~c2QC{gD#`&_<$>kF$w+d-hzEm@96kBv-0c=>1U{DWGa zj~Ec6Yl_c*sS#fX=d2pWp8+f3s%takbQ7 zVcw=Cd3=s4LAJ&PC`EVW6biadk?BF8bDETD4k1z?)gsU1h`f}~d>Q6uB_VPFUeD4m z_Z<3Gne_H7@!=9~2b4SGy}>`t>s{2N?~1{J=!QrpxqI7QJRI_ zB1g%9sJf$Ex+awLd6PS%?BXYux#+dZW_>}ezqOBRXd1<4oy}LhO$Bv#a|-X%#s?OC zCxDiPxj_4x_wS$SCOGN(=K0V?g5T_07o&Q3Q#;JVwhQ-_#ZtH`#MR7EAcjGRteOs= z1W(0$#*ha~v!>t-n7++RlQX^VTP4P?dHX*!ClwehTRq;M|my);$V; zHBc}u9G;btBDjg4=&ee!kNi0~>0oQCzRV5GRv-DYTi$bNsE;I$# zy{t1i(1?_|XfvN7FEcjs@y@Md`NexO?T-7)q)Tqrk&d^~dOJHgcs}qXJ>NY~i0)PQ zx2v@9yT~h|PA;Mb(%1;|DvVfa4jzD$E7~=Hs+3L?ajK7~w<4vh8DXVBVe}nH79VoZ zw|5!)Nait9{EiGUF>K_fgiEapd>H51_ zC~YwBsr|jjH~fSQ`gkx1o4WRM?oD}L_SFSF z2ZbQu(A#)d@fij3+s+CtI_|I^D0%L`Dn3m55>N8M%?+%d0WMma1b9XkKXmAk*pa-@ zzDm;&MG3yT%2kQjK8D*z329iW8eZq&1^`*CvC)SkiZ)BXu5iUZ1xI&>_Nj&VrLuBaR4Ek%d5bjVRWCWHa6#u|vM ziYqIB_NaTfj>fNkyi__Zm0Q35AZw!w;?{)3rETS&*Haamk3~PPrMZ@|X5d<%r)ys? 
z65*7Suyw!6r4Qjx(7iB*sa>aCQIqasZ}!J)+pG`P`^Hlj+GK@giQMyTr^oCS=`>l9 z{_@~q)$COa3yd2h}>xbw8D^$?=fyEb=>D&cu0#1PrG>;dH!nIr8{+#;sMvfJJTaJ9en zn9qory7=Mtl-IBQLX8mT3JW&L|9r4pfvD%DsYk+WVz&Z2f837!eUQPtMcb+-f;GJ4UV`)}rk^Hjs%WeUJj+b@9yF1aJ2tB#BI2c-| zL_MA3Dsj{4$5B_krp5P->K4mryw{)BJ`tMTkYGN*&~-q08YA4CX2o_l40}s_cB^9~ z%5@-(m4$`n(wx%iH`bLIhd;=$8?B8|kG_}nhyA5Xdm#vv>;!qH*j5coQE?k6RT9Ky zKfwPPoNSDRV{Ud3Q-v}_=r_Mpmaca1`^y#!Pmb7sir%S-b|;SighfmS9mDi!)5DVx3;)EATo4h6^Z4PdId-VRVQWVQw`ry{~K1M z91hwppbt%7v_zL}tB2U*?L}&}B?ZnmBt7ld=eA?TZ1?uFf^gKoYF*BoJ1A)B1y#O43e*pLN6)L8N7sB?8 z&hHr3UpS=b{APo&&WzE{l9&_Iw*~43dK=xezkgtvX4p_1wCiS_{_XX3voAge(Q`&t z818lGX>vVW7Ts=3^sx+^AF)T3_9;K)@#F9u!HO)+ zC{pu!aKL4z$mu^8?=)+DW{NSU@1++5nT(&ghC=K-KDvUit&2);}a4b zlGY~khxZP|I1ik8Bhirc?r3Sq0ml<(1m)(WU3JQd54Ig;0pt6Pn+D(a!9iEJ+2Srm9-IE4!A6P6N?z2_)t#B1#tsLf z65b>2FeqbDv%}W@!6E9#aMA`JzWj%exQ-qz1PwI)=HJjy-VR!6yIvlf&)_s&Xjw?} z%C$uMO?wf0d zOaeUm_u8`Z>mG{RcEa@DZixr%FsvJ6<8*xuEb4}*XF-kMj}B=_5Y^1N*T~mqerae> zcut=Ca)LrROQb1oYK$iz9QOcSw!|~gboB_FZ4g9OUVqWM?uq@Lkd-dWnlZNAT0X(y}rw9Cy<|4_a9?M8qt1eIa{Q=mmmABpD2-^R~e%4 zjdK74qhqf*EYv83=%wGrD|~Tr!6h)l>KjvNAWA?DC1ee|^kB{3RqE+=FIN#K8)#J%Fa*K|lneiW-H2m?&Q! 
z$GHm@>;?wCY#Fhl-SO(xBep@1f`O_}CqZA?{*3e2qZDihA>um0oGJJU=Q7kpJrBkI z=};wd7O38dseBU)JV{;`9hyg$p92J8be~=C*tBmC1bM$+^?x@6?w3|#Nzmy!o|goZ zO;E?!39XKViY3rzXchdmi}-dO28_-yfTx}Ig6pqbEw9^&7=82McnQEuWTY9^i`0M6vk1%yzt z2TN|Yf|yXgL`ug)xi_v9hY(vtBnj^-&i(%V`=O!wSoB2UxbWb(h!o%VJ4Ps^4S0!N z;o;%o82Q9STesxsKk+4qz5h~LV+sz!JTY!Ckw`YHXvPUe7YfpmGq{?=dopXHB( z^VZ1Dar@E3H%Noe6^oG^J>Ec~Lm}e#5E0)gj9-ipPUpynBdn^Ke|{#%Fpd=(!dh{s z%t3^w@K%=sb_5gbc?fzD&_#%Q8ljWY(cG4N7TskAkVWJv4pCzTW0#`8gvajeD>1XW zfHmtt($u;ZfaUY?MdJif=<-jT;C|&*ezRxka5%pvLHUj2pa!3D1i?5~^Dl1(00IK| zjQAHgPQ;k-ZN@HvMAiEu$TzDN7QB>)+Ku}Y@WqH8hpD~j?(B?-L&{D@?LoMWz_cAiP+{?Q_x^ec+krRYIUqp%wbj}9yFGYXB-Vdk~wVogM0|ZCul^#Y8NH}xpTPDDclOA~=olxFY{3Cg?jKAJeW*rXg& z{UECUOFEEW3_h_(D;IT1uj)D4Q)KAsUh{w9h*{D`{XsR#9D`9|*1?pN6!mDRnuUt{ zk>P*;eTLLTW0QBYh?(elAG7Yj>NFX?1dVuqCv(TwynggiW63(9nUGkgX zWikjYgWZ)YdP#=l;yr)bs2iG_PnQbkzA(sLK=+EtaRLg{;uAhzAD+1Pn}+Ml=x1C3 z+En+~@S;=DN7ik@-Te=^eQs#OEE^3v4!G*?ag#0c4-@)=g%zT%xDniMXo13~f@y-c z5i5l`re8ogK{NjS?-9pZekTN#wJByN?S@fqY6KGlXNL>O7?m3}K-+1js>gJ<-O?>N z{h)XSV(->)fA&X0`W)+jRZFMHcw!_bwQ>c7%vlJ#`zS==Gl;_8lPN~+h*BiI<#Rs^ zxst;}2bJMUV2M`4?pV<8LHG~qg6-T4bA5H+6Ce&oXkxs`0um#;)P<1#Je?Sor$G3}%VttB8-UxF50>@MD46&<&pAqJcnQY(e*d9_m1b#4j_yZvqp z6wa7oB))UDo@IqexP;`i_1(?cvk=^gnwG^hU&fpj(Igui&VLW5MCmBPLWWWqfiXS; zV)1^olC>wY3K4{6U{~jcPmDZDtSUz85sf4?Mop>w&4mSjoX{q{5xqJ&7%F+tUhMOZ zU!;yx#1>nSuZmmLYTAKBSX*1)-?*0o<}V^oyxE`XMg(1(q{pyl&z>SEA)dxJV=qv} zr%x9!P^BC=^M>i5jOUl2hX%aPQ#Z-*Aa;A8q%jY?$_U@Bw!y(O#F#;X@`z)y-f&0b`}FLvwTX(1w?J$~#yXD8PjUK^`^G_<}l;wUCOxS|vgM0q|; zQYu}H8Hon^78h0H{WDK|VV)NMHW!%z=o&&tH04mrsM88)4*r4IZBR#VZf=f8hw!OR z^yj*bka~kP27x5t49qEuaJ#l~7GM^zb>ETAWE)nWZ;YvT5DDeqo_KA4>#st8twPv= zdC<^do)v3>rOtZCr3>$JQuys6LgWtx5_<9kb(ov!J~8IhgS8f!noi!E?dj?t5Qgq` zTgd7+Nc@*4S0UMp_;AoPmXkPxWkw%xcz}#x9=K<<@kawpo~|es9o;C8sS?NB>^1>| zBQnS7$&c4zG}}GzDD0 zS)sY79R*5OMmQe5x5#Vlt?g8=zuyq`&_2|eO#G#u73e9qV@^3jKY)DF)KBbyw)rfx zf4^-<$@AZJP9iJGf6DJZPQSz_F;t<(m3j2G65~6yquYfeiQ~R$U(~<6&ZAJOo1l*E 
zy^2fHg;mt1-}eaVrpEaB)$-W{VMjN1X3Wmc8f08Krnx+-pw;&{kC7fO*>VPkY9Yly zviw1!%;X-{A%nPsX_2Er)C4xWMu&!e18B1?jmQeQx0Ia42a_4M5wgJrhL$3h*6;FcY)QyVvw*If;A5F9VB_a zH37zzEmIt8CM9f%+^WP4zFu;hXWd`a0PY}aaPmgp3hQ^FDqF=SLi(FDM7;B*7w_)= zZYAoyB(7C7gxQVvVN&zSMP!7GS0w^%5Px?6o(8y>r;Lr z;VfIdaSF?Vy7k^*YMp$n6CV^8Sqsi?-64m#XwRJ_eJdxHgS>Ls_Hteyo~B7SMuldmX>xKo}k85>`U6E2&)U>s}VRFk&+<~r^M35 zN4+HMg9=YpR@Pc|0-DY8kiBA(p06;^*NJ$G5*Oi)bDw=bBIPs30Ng9N6uxgvx2&mN zm*q7zNx!iv9AByhPiTi@`){(k?_8}yXv7@dF?(x_YGQu>67&8biu$}YH?|N(eF2{e z_EkA5?;N^c`;lZo}6 zSkPN3FuU$ggO>VwDN~ydwZvmKYX3k;Xwq*e{|g{znv$k!e9e#c1K)ga&qx?23Vvwd zDO9abiGu%p*zzSkiMK{L>c0{BbWFK>XW0W2!MOd}Wz3aeoD?!xf=l9prH}-&bC^Zp(T)dJlUpRj+B_o5@_b|k3q0|W z+3t`EyxH;PwLs)=mj@gb76O&P>nL<#^xJ(J!sYV~m+&EPU4Tin7#Td&Dg`}Ni?0e% z14ua{mgc{_-B<1(b7fnus=K+lHAazKF)L&q$7kHR-6dynVKyf75;Il)+8scvr|h}l zVNCF7F?eJ&v-%~m!$=<+Nny?mVt|7k3kVmvI6lyVQ;1dcTM?_s&P~<&bZ3RsV6KSmWd-!hkeKK*@ma|2W;H-hUH1LL#oQt5ebgT-({tD0{DP$Q>ZlBLg z^zYb~UgwDud5w*GacJ<7db3c|nOIt01I^+0Q$qw0LuTq^m*+3ubIRR4h=+hyjem{X zp+kp^H?6c_^!{#S@;5BR6hQ1KekgSww@lq813D!rO89#y6u}>e;`!&oT9H3c(dLQM zxz7DO>Zq)nX>V=klw6T^isJ=NB;_xbEyvwTWw`iYQ$qucLuPPzZm!J%P?slPkVc&Bx{C5t4fkU2Cm@yZ`<~>6>Wl*mo}0IdtfdkIzs+Xez!3Jm-rGd}uL?|6GUGYpp};Jp2*c z){xM^I6U$n|4A4;Fkc3>R>tMKAy{k)P%t(&c5FbPo+Y55wc~y=f9z`P5xW7I%N*Dx zXUMg4$%L|Enw!eMgm8HIcfzDE7u8?n5H2(jloW(7{ZNR7`>UqsA49$XU~I207g>`6bPrfS$$c|-jB=PVOLNn? 
z{HFU4!xQC%;Z4jhXDoxJ zDF81!g$fj9=ADTE3hm)Wu|j<8&oSOfhYpNF=S&unN!?`U7Mu=<4D@nQq|O(Nz54M3 zTQ&un@5qx*?_KZ8@!vwenE)m<<>m;8wbn8-D@(g!Evnl!-5)Si%+1Y>Y-k6d5`%a7p(0T5EX_y0p1nH=vk53phnETVIJRJLw_N!^|um{I-Q zwP%;|(&DR|(hBAP62@O_EK7(|y?~T5U#){N?do65$avjqWKbw+8$K=J;-3pn7KmLs zqOHz_x$ivpY9oMQcD)lUgiw}l6Vb9=IC(0}ACThC?s`ljUst0G5EHbkB69={Rh%6u zZK6keEquw3XN3?cq89!KB7^$vCGQU2E#A)f?Tyn@<%I#g=I{l&KmJ53sdAUoX?xEo zIp=k`L`|8#kTl)=KboSZ1}e{_P$-*DLiMxduHmXhu#(H`CNvwMcVuv&Q@^as3f}{~ z{o5;Y!HnWtY5e^MG?l){*;>7b1FM^f!7~&FKb45D4YonQKA2MRw&yVh3--CUuz^0HX$C?#uHfjt(88d*^`67nhlQ>UVb};G@7A=pxg0$BpNwE%X;0cCm@3uPBtjp8r#_a%2yWI_Ms}0ZC6U&Od{z z`1o-T-0Q6yR&WOmxf3VNEWu<&WeF#!CR}C!EaXc%k`g@LY{p9ayp1HSAyvU8X~>s? zNgGIIABJnh`@aX|^N$o+QHpj=`}Fp2`R7-perOI}l1Q|b8&*{ReAD=8CByj9HHMxK zi8W_SEbc=z4y>VgEPV@$oIjs$=1iI3X~bUc@jw-Y8+#AyNcTA3(DF$aT3?8@o1{)9 zpL^SE$BsIkU+;_c5PZ)Z#h*mB8P0|${#H{HA#!2U4Y!68ow3C-?KE-o5~qjU&qMfU z`_Z#srRiBx^wx{`4W})t5rIEGFfZC2F_^wj=je1nEK971Az)E)$v!Yr+LjMrtvfu{ zBa>WZJRJ(X{Fgju&z?23z=)9sERKfE_0ISAAXBu41{ODHiejB$J*S_a-<=An#RkgE zFvYuW1bwA?4;XGVl04-9|6ugINOE1Zzjx>Xq`%|4w+a`8Z6xD~&@n1p@ca9ctv&tw zON*-C^fEc0TzXK&Zqzt43y0KBDY$cS9 zM05$QuS%M)*F`%>e$Knj$BOZ7b3tCjbn62EPD)DpgH&V|vhDr&K+d5MYTM|^c2Mjg zd|`X^Kd!6SP4X6;z3na)?%NA)sQzr9J{1Q4B=g76;A&yHttI)AJ4JI8FrN_=0VZ>) zmO}!lb1RF0!2gm_TA5Sg%VX5L0x#QQqIUvrcRoiL>Oa!^jkX&YFRHc4+}sx?zUz3M zi&mp}Bd}MQLELxH!f~0tsoU$vw0h?5bhK;;N*h8!nPmHJ`O_bSEtYWwZ8P@B=?+uT zsbSVN==|W;tzAfvl1c(@4wYFOzryuDqR4$G<6Xyr$3TFeE#lV(i}Q~7qMjr=EjdVGcn08J_k&Gtt&m|5t&GXU#MeaIA4_&~q&i!4c z@h2|1RXet6F{1D@dB?iK=T)}9boH+W`W$@rV~r}hv&c~E{m}emlFe{qZx(_HKmgRX zE-Q4w;KF$X$(hZ6&yINvk7Epi=d9wk2qTjg-dx|z_~C7GWaZDVGsSw3A3b7tY?6%g znp;5t40n;DjX`!x_I-eAYe|awma`={L~Lq#SpLDpmKsNQN!oNaC`pz&G`tHxBF1PNIK7E#TcGD!mr~`@&F2)uy$QUulvF;V-P2bV3c0H=xHM~{h=SDT)%$q^r{?V=Y6r9o~uB&UH=8|NTLDZH|t zQIzGLIkHJGUnCL3!R|a(bEywZ(>6YgII zfIQfLHkIF-jBL@;p8xqR8X!WXE)Z+I55|hRn{eI)Ui=qNhHI8T;Uo+SOW<|Ir^aqg zlDYmkRK@-})U?Nrty1bQo&*KFIYY01J1NOE+b?Ztu{x8?IlY>S`V?IZEk(CZsn#OK 
z19kEN06+FN(d&8I5|#S4yleP@AxYhzMdCVJLOwHn{iaNZSDyBD@qBCVZMvH{UptLc z);!q}yhn-|5`%UuJo<+|6G7^sj3XAeOHT{jC4J!AYiTn9I}rT)zr$rlFZo)FQ0hx* z*w10Ni_bUCoSR%mY71GTsk!T=V1;)-P~Zhxg|6YO8>nf_)``x`zu^1t+Uz&Y*lJJK zalhlTtElhqdSPu}no{K?r_c9%q)4KlY;<||PEko|(?1avMR`Y*L>TXYOmZINCl#sg zRS&KGJ$a-T6N>EmY)mT45yUFM(=lB!XU;&xIUGpWvIC8M0!NCpKl)49GUwYdk#7Ae z3+CRFLA3dsJmOzrz{smRS8pr12_yrg`zPr}??Gk)wt9SpgA)e7oputMzw8@QKXw2B za$7frgrwymByC=zYJ6^NlkYf6)f}?r*OdAQC@zxD%dz<4NsSkc4+waoD0a^8SZ#Nko?k3@gF#37krJid?PgPy1zDCeCgHb z=;)YN|DBx=IWaJL)E#3D5a-{kN=J$FvWTMt30vr$*udE-wPYm^ zI6E^_(}RZ&2`|_pyJI5ureG=sjHbJ%XUgQsXyf|@N(|Gp0;~t*%?WM7aPeSZoSmFZ zvE!rV0~oAJOZNT4#cN8_u4PG@rb{1etJZe4ALs9RMsh>gkgjpvPH={j^njKXLa+Nw zlxx zrr7LU10<=gx|;7W^fNJrdw_W#%U8Wf24)k7N$!Ly1(8Wi!!z;0cYF{8>RbQ74gN!# zFjaR4)!M-!1Bheua!s=yA(`w3qw5pLQ=ZL6kLpdCUEJVOWW16Ab>X2Cop>u`*PzPw z!F&wUCEXZ&=JoT?xD+GXF>@i1u^++UI4^a`!1Ja!2V?fFh%dr)eaDDOp81vr?ELp8)n4TXU+(KE zU9hb9JsCYd{a9vpS#jdb2{YnadOk_rw%X#`RC2c%RrRhUE!Ph~zGFAtEy6&@f2KNo z-s7d`7P?;fJ_Yz*dD_8FRD!rz-Di|JfDStyIAFMs7d@blF}G?juS?U2XG&k5iNPxn zz6~C~8=C6%>wubCDnN^WNLgGcP)|$%UWPPpD|Ae7fMAEW~oRyElYsJ`D>CQ@8xEh;4++@w%A_newel+Lk86SP7p_ zOjsa-sIfuDmb}HrT*pGFWxG%M=!v1HbSf<~lYQ{Is$R5WV8lxYC1nnd!Slzaw5GPE ze@?JDx4-ox!ph6WA>`n%q;6+Y8HCZW$ujMBGFhVK)K;BkTbYkt)9BNP=gb4Byd&-z zs@@Gh6%8=pKphdWWctcZ=ZIl#9iE@2#J*GO0eD%=H#2P>f~$d}*S)rA)mXxq zk-ot_3p5MP#Nr=`-!6`ay#++8f&Als1=aaO)4tz>Iw7iOuFS@D&k?jWj4BGaIr~#7 zu30BshOI8edMbLsj@b?M^>y!a&BPFiX(BFMnx*^S(77hwL_F0GCH_dl$~}!V29y5X8mLIkidnj?FyX z>zjTumP3sB{W!g1zQAtw!HSg39!yBzSdfpY_Coe&QuI&H62s_9Y&5%V`N(Hz5Cz)i z#UoRL+{;wE`V*6zVYVrK@5=ryv?c80>pWT73rnJL_lWlZK zkI>}Q{_fS$GTik~A>6`p4|jHdsFOC_i#gbQ3)$#D<-{iZ{+)Op<7?kx)W<&l>8Y7m zG7e3TWQA;bc{$o3V#kI&480P-jQGu8o(+}VTvRK_EhG#4SgM;H_xzlP(&R-DUb$kP zxkFuDT^Dpvqc3Z%p%qG1a#^1v&B}ko5%2e(ygrK>KQo4)@u$x91IdKM$M}KyyZt|% z&!`rElYLK=azL$eCU&)1rqD!I;|os_S2uw=2LpvD6u@0U_LdG6DYxFPYS6(z822jg ze-$Q~$!v8qVjz}~e2$d5rtK*YD)*ijlbtT6Iel3T67&6`&G)yu0wUOZd_?OHiq=wG zYmlqRh)T>*J#MKCpN49DjElT_nYKLPy1kdK3b+##0~=e@tzY$3_5HTlXV)!-Ad4ou 
z=e<++o}rg}18Farp4%S9H}UbspdKPDHx92{A9~|cSy{W`Hf53m|H(=i|H`Z<50CXY zs&-ItLiE%EJ_czpEc{FrqP7Xtd0yV$K!ma+_*V|D=2iHlsX z&q|NZ(*Y8NQsPV2!`M$vTOgF~o&`Cv)se~ZmD(6m0zK4^@86$)JIT>IUUBZ?sU}Yj z+(Ip;Qie25=j*xufjoBXb-HD+<6FV${{CQ5HC)DM4Zl%=2i`dR5>Q&ud#4h!M~?Pd z9wjRosGXDa!eZ;D(=V(`)R2{{)(RKCpPYMpHd*#_SMuGPTLAl@-6{GC=1G42KZM1+b5PSkHWsfmLSp>$Xoh)>yTj4riZnERMGe|XE% zH(9E~%f6q@(E|0D8ZLWpELXMgAaMNJK8gJwA5=iY^gb0`lsjJU{C#2E-iZi{C+5+) zZgMEx=nRwGfzB|fXLMyN&c2#?Jrfg4nlV+l;r#+vtB!&rP=p~w{40aYy>%obBLiCU z*w|KF>aXe84-RpX)3R}dvb!w@mC1b|9{vRn0YQL|S8V>ZNR4Tr>1%0~S5lIb)xG;R zX+$II>)?>XxsA!MG854*@Ux58(JkH?9jpneamovV9y$Ezt*`NBfX@$Qs0*HTH&$*j zwl)CY0bbyFgZGnUZ7wnl0MY@V-wk&_07CY;pg;0ot7xYKxYKm?4Dy;#}K)LB%wuin2;XX88L%Nf;nm#~AnupE_?rld*VXmAgb)84bEj zz*$~NM=mX_czVcK&7wzaog8DoJYH=XkKlqW8!E$JoV1Y1hOj2G$O@Z8#vX$op<=_P z7!PK+qt+=owC(AEckl0hK=#i_L2{Fi6jYrGF1hQ{)TZ7268z4EX-mPJl+1m2jgzP= z(_;#O(z~YP6N7KKhDi6X*O)HYhAH?cAK+KfvG{#Y@ogN9T>L3X-RoV5s^`V#5opuv z{E?K)^c%MJ_P0DNdaD+7NT00SAuySeO~w9?%PY zxWaWQ5Az&4r()8jyQ}L{ERR?HO0u+9ucGnC#@f^+Hva2HJ5M{wW3T```2y4UNY!3= zP2HJ`6Sq(LEGObsqK^3Ib&OWMUn}UDvj3E(2r?dT@pq)>cNKfN4FxpwDl0Xiogvaw zQ8c?_Kn8sAy|LH`t-6sd(y#iWBw*0P<;hNfF90Appljhw;xnRZ0wFwD$EJftEp$R^ z*h)Mc0p#+X*iao6b{PEKr5@Y8{)h%>MtIr3WeojgwT|5Op(yFigYbnYYfC15eSP=K zQ?(~4s_#Fg9}H>X1NwX7Y-XhfO4emb4qrX$!>ATRpn6`mT7V*SIe48DW7TXBep9@6 zWAA_kFed=bwPP2lRY`>}SRDB2aG8Av+`HkgD!c;B@$i@KFYR4$yU2!2)4k{E^Vd2} zFP{l%2w(DnuP6INwt^UuQgtQ=NdOEbItmWt(@zl{x2`@4VGt&I^9Dy_I*y?s<0B?I-1|k;;|CAE=$SpHL>ou> zCJPzl)@|FW2!M$7qNG zKUF*5vPNObqn*1vp_g-Bxc#i66b5U}xy%7?XcW6=*RFY&Imnu`F&>weSxf$THH_82MY=F_?W(JUDZNDl0->z@6t4;-B z8MH0t5jcN)d2ycE+0$398Qszo>oyf#l{uZ=Qr+h#W~-(?cCg4e{PkrXoN=hC>p=b+ zky*w8{dJMpul5cjPJVS>T%$9m&2+r!sa2sDU%EByCF8vAD%m2sjd>IrdN48XfI2j5 zSy~G1vDY4e2iPBB-9bs=NaDswl(|K8o2;h#Z?HXa%bddYLj z^>|dhX;!sDMA<=aim|yNc2*dO3bHGb=fq3)m^l3uTb9n`pEbR*#dhFSBz&q zecX5o?5+hwOKm3EJn^Gg^1RYq+TWKOM?WoqLHjct%HZR~J*w%BzA`aMAv2~$U!c9< z62*Egpmti#iN%jFsrN#3_T>Zvs**R5J2}ONZC9_L?R4IS5vp~?BX5yOvJx{xUHZ+QPK3$V^P%CY@D>AslPG5XI2YE!UQz5Y#K#;wDBKcWiM 
zJ2!a9nA9)ng`$3OjOSPWQdxYV_PZLbxIgEVms3iXt1<)|4cWYI_8y~**%%*0;NH#CHYH7Dvt~zZRY;6lpnW)_j9jblADlPdvd}4r`;bI zmZ-r__9#QK;x8Fs?(%Nc57UTy%sVun+v{hUO++zO{ow+6OO2 zi&kOP#qBhsp{H-boqKh>kOXDD%Y(lF(6|pyHSYF7Tdh){7eoD7KdFHaC^c(5{l(m{ z5%Tr7V2AYgTg>cJn;+F=d89TWkA5E2sCH$6>#?M5IoxppWtHAN6&UoscJ3suk>`4Z zHlN#X%?&^QbAwMh;x6nw?4ITJ*}mud{bd)Pd=EXuA2yC8#2D$dY>{{I->Va+Nff{s z!SoEwxhjybkM1YbN4^VZE`h+b#YTyelubs(1!f1}ruJgc)buL+ZkeGE~I$6Voc8D)9-dg?~VFfe+7-8D` zQtA1BEsgQqbNIH<R5ibGoa>fRe&_a9fDT#&Do|-EO%~s+EdvcbPK{3 z4eMvT8q=3`Ot1n-Hhk{KggJZOW{V{{&XUudD|=N)brO0Dg<1Ye;3GJcqM!FSPA4CE z=W$xEr*`iRCH0_r@{dvCL=;zFGSUmU2;cSR*OGr2yr!UQz|DIMnoTP0f_YlK(Rqdihk&M!I`_VGTUixhPT9YdK6Wj9k(Mx!HN!boR0 z82mX{Df9xuT3}Eu#Zd*OHX$%8eSEM(?XKwkaH^(Hd^W>rHhXpldyO*sL^11b!*!)S z_=yE|bT^4Vzl9pOkKs_O(tnc){#DvM(b31&xL=mH_#~ES-hXi6wh>=u(ZG+|d~{jwLgq6*6Qu5bTmpEC0?mHA@!t!eiIjb5ocUJe zw`nz)!YNf}JD%RrdX@k8GWRBA;*Ng=3b}Id^y0UiuV&6=2~P0FIMjw_1760VKFS;# zC>U~qY~Z54Wd)%#MWinq~X3C3RyjG&;D^gx9m;9iJ4RvD87-3u3WX>3*@Z#`_MrqWoXv!H6r#W}c1 zbeE|fjrSifkIF*$njyRizp{`_YONUKho$KX83XZ%<#)G_&%%L_p^`|kC~gS>IIm?3 z+L{syj!*O{fj@scE4FQW1@eKAgqid87^YFQl3K3DGZZD-ax<^H3AAa=-rk$Et|*U{=!$XtqaQEY;lQUrFCaB5%@6+t`~!LpBYW&NeXlOvEG5q*M}P{8RHTJwo6Pte}K_fmDtHt+8Dm@R-$--jQU%UWc9eH{J2 zX_)M7+2D7mIVnio_tJ6ZHMfq~NLwr^cU-}8acKdZCU3%5 zmNFmqz{cwI{+33lQg?i_x#6_hLux$&F8{&NnSZ_~9c_-sERVKb2PF_DDU#lr{&pa& z=aMu(9mG)vU_!EOdZ`)Kgyu+0fCkENu8kW%GUzQ-7sq2sJ398+!X@84r@1x0J6V-! 
zc!7NAfA#>*BJctn{n?qyGA!4PUMpC*+u+8h2dl^{K9OGtKBVn#JybQKD@YL#QEY`1 zJ?a&^)KV^@M z1T0H{Jsq<^l6Tb)C(85nR=#0e5Nay>Zp=1l3HL^}*RCX@WXsnQ-i$oV$?$KsLmn}2 zzClXqgSzF+YefSSl5>F2&rtpt&_%VftGOH5LMQA9bU>7v)jO~L>>x%oW`JWIOa>>@cOITXT5`(=q)71 ztWI~4MK#Xa0iw3?-t4~w%AKBzeI966?W@1J*!$(R%|l48mZB6+{su>oh_a64f+$nV zy*l3*Texn%*Na8TyO9a6pTs`=fl|0g{Uova^x93`1wMnF<<14HLD$Nf=Tb$fu;XFo zeXK4ZQlE-_u}>c)*Piq!W*0Um^K?M0J!1)uJ5y+7@w>2?{CoGC9Z?}}+;7-1YZ#EJ zMU`fUh<Xix|Qt;iu$tlwk9C8!E@ZklonP4zqSBvKle~ALL%V#xWbJ`Zeq=Qv}eB#(R14*uFCx& zrQx23d_-gQqtE+KEk+i)Xlpg@+Fdx#z1%ep-nH-Y@9GY_R%!)X_R|E%iE`y@PVq#W zCVT@5HDy^{AWM*;pP3u}#!4bfRgU2`q+&8L9ZyhG5<`}~yl7}MMLNDGDny9&UTq;D zeic5(^6#|m_fT+_h#zWthYro`CcCgRatn9n2LilUlK!&HCJb`+Hb$vf%uWW$Ha&cP zdv`mGF)wG6<#1*h8AS#rp=aPZCQRghMGK*(%gh@n23bFm$cpf;XG`xo57#-!>Z8+- zGm5?T*|@~ZaJsQbEm+^ug%4+1ne@@p43r%X7VE|D1kMXN`0bh}FZZ6Lhwg`vckpX) z4D~+O>Y8q6=k{IFV1QDi^|5^wbTVR7hH(KDRA>{gA$88avw5y$wo%lL124}kshrO8 zf#MY50pb)lg+PY|gS9)6d19(W_DunSHOF`Rwi#TBOV*y_u$F1f!%0G(Y#V-`*01M{ zK#BA$RAmj?vL?=oJbtuGc&&Pcx?nkYCdlK1(EIcIl=1BG6d#>%_<@=q;=lZx5M`iT z99-}JnUQTH4HBS7tni1E)7ZGdwhp_Rch1=}`tPlhLRQMD7;=*Fgaa0I;Bo$6w`sxS z^sYpOZBnmPY7^$oTCbm6Q;o&;Z(8NaCHQbKPk5WxL-cJmme0MmpA!4@=T_8)KG=c1 zY5PPmmqJ+jBLHTbRs&P}z}bWoxT_ZzXZr%Yd&2I$^OrNezyV$Az)g*E?SAAlwEJ zmm$yk2m>Wo3r?dCdGsW(i4}J)FFm-C%M0igneTLn7eK({wj^r}j#cxT)Y*#GQ3o#T zcTe}rMqL@Q<*PIk!Qty&PpPRiP4>k=-kZI53Oj(M{zjN)tsw+20}uG`f{QrAzZxh} z)BvkNppfd?C=|9I*NbGNO11{9ZtFY`39lN5exd=})5H{Q(1%-b#C`G`QDiKLO z>Ie)d)?wDKME+7dvO;q8(M>uKwDY3OYs$ih8wWHI{2H3SpOj+Es+4I>hyZX+(aZwPXHl z|48Sgy%MzQR;2B{YMy83F+9Sa1q*lNo`CmC^m;xG$lMd;h~TomuwX}nw`q65Bn-YW zT6%+F#eAh}DM(0ng2P1yRXRpE?{yNa=}>j#T!+=g+JB2z1XGf1yV zQ7l{rY6KcMhotvg#&|DTI9uNzqRfkUaz%C3(4c$o4Ht}fYtQ=$Y!ulxN_26SvuNvK zM}`}P->&W&I$`oss!#J#-^r>{mj#LIpzY=&s2Sj|^}FB4Y-OHDxD~xn2P&}Vz&k4C zVl;(Y!c9HpM}vx1zU+xJ>=uShU{q~1MJ_*^hkkm#ZRS^cJ%JB$@0)o8uqx|2_!yBE zsGf3XeOsAc>mNr|CIUjPQjf?PS^LqftT|&$9?o%@>6euC<)TCUi}^K^uP43QAuy_x zQ`tZZ&N}{N8Pf#`KF%cA<}10!b02&{sRQtOK1(gnM{wItbUID?{n_b&H#Fv_2n`6Q z-rY6F9;?BU@A~;j9JN! 
zjApCJU=oVYfHZhoW*{dPcEFr8)7RX+QqzsdX8$lxsAgFZz=yX^0>zU5gGq2GKe@`W z>hb#xXT2SkA*!z2?jQJB@P4T@_=aq=GG>Ihl$P09pf&4?j|IA|&DSB@K$#ES3X91< zkzR=Ya}AiD=JOH)ntGDNp=sF^n(3|);y}xbeO3jU>uMu_1#RWjZdqvDyiD$&C za0({i0d{b>FJt^4+uYen;SE$+Q|^p@BFePmdcII}uw;G*)>r|Lh)A^p=T7P;RU7YT zGi>}?gJPU7V#@E|FU5%z(PN!)y%Jd!TpQ4o{N*ezG29nxK;f3Ky4;p6RmOx&BG1!$ z)mkg&TE;7%h{jUYm$H&AC3Be^j^g$?dPX`u}6Cnm3i0*I^K4*2$Vj?m@VSD zZOuL68`&Mn1jFR@ezWns$-F%!1kh!z3%L1ErHwnQ%lIj31$(Gx!(Gr7$P#Op=SZmM zbi|_jn59bFc_x$OxL{I`MmgtBT~p^SjI2wXy>-vN51g`Qk75w2kiSh?ycZG=gkr## z!O97Xr;a#2ZiTk`o9D0;tQ|u#0;X_f%ssAVi}-SyCEqm(gAsMV$Y^xL*yLdi7bIPpFr9IKyd6qI`UHfGo)1Enc=t}0-$ z&_&J?PD4=x+0khXdL5l(1S(S^f~cYEQCqgAG(I?|*$2MURM^7hyf_wcuBinM|4f5p z`v=_Xv*Ny@2YYmxx0m>w%rN%l?tQ>w^_JH7c@$?1{cd%fXirdkmO8dkFz$Mms_9&& z+x^C6?)rW+jKKN778G7T@n_eMhDHN_kOE?9^^pmHWG`g6Wyz;DUHGxsaHwi3`dwcE z*bok@L6GFK=NL&49HH&!HuwB-3-e>wzSbx=A|)1U4)_p*)B38-F)a#(0g42 zaCp+I4BUQ|^Gp8nA8>1ZoXAM}UT9fjF2o^%Y8#ajSPVUS*-d-GnuO}H0o49+g-}5| z7Gm?>?`2e}Eu>wmjX9*m4bBi5z-2XoB}Pnh3m-?+yEhQz1H`g5l0f^7L2&t{LQb+V zuMO=sj|;YBmaneBdw&T~?#P=llF_w8UY`cd{!6a{hFFJ}9T8lp!I44NFqUsnINXC@|W z2ehd%t}5`sEA{)K@~FZefR#uIv%Nn5*VzEcq$1K!C=_L;PesPnCnU~Z1-SEW%`CB? 
z-iGK{I9JDuEVu!nI6~zC$MJkA9p%?@6bXobOFd?veH9kCq9k?qp81k5Y{w^yeL|0H za#P#VBDI!eiMPOd^Ofn~WxneFBKHAV^rK48MstBbaJ*H>aIy+XyGJ7s5U>moRI?V@ zbv&0a`bTa!Mo|0NvOmRcXD`i7Yf0;TdrCeQ@tgPv%LZ!z zHk|u@5We`VxW=a(ZR=6#&?jZSR6KRD1h`lU>XbLwP>!^$aM zy-7;t+B{+%cPoZ36w_++caiVfk?PPwu36x3isTK)Y`DuQ2wxuepj_Qk7@h^+#uB_& zEieEY8FwP<%Q-De+BxmskgtRNty5kJtD_KyEBRn;km#VjCDb07+HiS=qzpr38WrlZIk1@y>iD@obitD1|0BJm>424&l}krMKOiG;6qtaYnk^v;N7qT9Koe)wHJLfo zG9G#X1+6i>Asocy)NAnM;n3IA1jvr|zW{hSqXl9*{bp-H>H=A>7N{^dCA>$Gs{6&c zfqX>NU$1C_JWM9z>lL%JB1`rIMAX}b4^|hSiO6PHTm_zfxrpIjDUw={tY0Bq=HD?G%#KmPSJ}DSb`+-jHCHsi6=gSce&p&BfEMT47M5q^D!~d(WT(XT^ zx%M2}oqPrK)tyikL;1z2e**33U%-2B-Xwe=RCH=7AVBnQi0i9#O-C<^0>dHfq9?z} zG~{?PaM6l!q_prr3e*DKn2m+hvDRa#u(BN23uBE~YSse=;U(Axpw_7FmHQSm-Sd!fH2?|TDU&SF2l3jeKEfY<7U&dd?^C z1f3v-#-j($y||r3k`@v?m&-1W*889U^q(i5V1L^Tlx#@p!CUblFIo4dkw8FV`~N3b zp&G>p$2MF(D|8G|KBOYZdtUr z0z3R43_OlLBtw8C+#uvt0X^jhzlUIIw{ywHHLNGE9quB(p-7Zj$k9#00Y>8b1o&&p zW2fL=1RCSNp0YGmr-X|@~FK_kQdEKxH;9&yW)_vj0tlD|MkeO+|+3BymPaG~Q~ zYqMc*@A`Av>sAb38bL#=0*xMCfDzcyTsefY&b}z}1?F6{B;`*KOsT?^yPTP+NF!e_ zq6ZlTn|pk{{F6wOqXCg4tF&eFNU~9d$nad$`8AM#$G6MA`^kTG)$>8s()b$t;z&in zKb67sqyRzC%NvGGk7RD+=ZyBjx9$#Dfq5#Q;8a;mu1pS3lUlVcfk}(AbD4X!=Rtv4 z?%8<%OGx&|fU_DuOWR6{Ming7Ms`iWTQJ~MNS%<87};u}hyGLF?KHTUP}JENP-G7525l{3@ilo2 z96C6!c(F3WMrhR;+=%09$d-_?X;eC&)A-3bWnX(zMCr<5Y6E|nYDOC152FT0p`B38 zwFUY^%q4p>D;D~zX=w@fna{VB-4ZV?adzn(*F4gOjo`lF@3o!8=h2+>5to6aWtu-T zCrsf5I~~U2hhvZiYi~)vKcB%ru?-%j;c9(*8*&I>#`EW3?v|EcN8j3}Q9eKLGNs1! 
zG&?X_z$?%n6Xe4<#$;;x`l+gux~A{^dXBC$6|ZrV9Zg%aJ~}A>FTLfoyP665Qx$k2 zzHOd=26%0B9&l=$8S`8a$6a=aCcg((k};q7JO5dSN!F$(`T?y6ak6%!5^35Mu0&Wq zR38C4C`Y9SR<|L6M&79ILiP9NSDVfcSgK~teuJ;>E|z%_@1k4A8RRr})lOlH+= zU`P3Em+%Ic!28`Q;XGsBr+tW+P+6I=+=enqgT2T{%9boxOz33%w8{pm`t`2Yiwic0 ziU_H4qGWm{IZ7(G^9;=-)(YWcyDES%`(Q20fbYt?-)zvG`-Xk_Ic*@e=TsZ+kYw`B z`px1HF3pPrE_QIJHR;9tU|!Ls3-7-)VyGwJYJN~~sy(=0VOLjy&&7=0E2Cvc)O|#& z_Qq__9Hy<;vbioZOJYnHkFwXb&UC_Gp-5BMa2^av6B2+K zeLPz+(dp{%<`q^FTxd_XgJ7wq4rztB;*Q+aDJP3Zs-3E1xz&M6!R^Al#b!~;E>VDDYz77gRHI$zG4f`(UoOcTe52fNWJD%L6|X3udB|C& z^#V2Bki&g>CGu(2@f! zS^?qrR<5v}4Mp$XzD;-dDeRms&~3ITavnN<8TUhwx?6y0Zj5KllO@8JHz8_0kj|Wl z?SCN|=BGS~gxOJl=Lk)R^AgrHItXpZ5hVTz*J6$^YUouaS=5KOwR=w*noN2GhqUVO zGRYC;A~Y2s2t(TJkH3;njX~!F($!vq@ppnn$fk$VU`Rx;;F|>0@R|c z54VC2H1!sgsDuJH|ExVxOt2+f3{?Y*!`lEIKwv=Y_3KPP!ii@9IXtv(K%(``mwI(d zcnwL6%1Qz5dV#Aajvtif#}r1`qn1&xzQnmuFj{DB=T(0Be)%Bn~fF7DVY!_me*!CO30&sh%+m-D6 zDD>{+7V5yhG`wxi4X9m=@vVO4{P*-^!qn1I3gX@NdPPp3+wzG>8vMn5<1E(@Qt7X2 z1Mff}SUd}o=`*kPLuKu8{3Fm7!BMW0FQ1c~VY`YnR8s-NRM& zig&Q>h2;_FbEg5*hb;q_ID4uJ$P}^?6vM2^(jdNG(U68AGmBhnL1V5-J=ne*n zplGAb%twfGo0><*xsPza!s7NGfWQY1&YD3flXL{EjA1Yup&!9S9)gM^kANP3Eq~D_ zE0|!7sl^oaHrjuKP(QR1tpp3=_m;8?y|2M_-EPQ1N_&(s zCh9?M(}=#!D)+?H8oDrQ#1B9tjWAIjh^c1(H`~?Br^k8wZEMSEU0PFs6gKZNX3CMt zOOy7cveXeprVxJq7-+e}eDHqc=V~9y8=+uuqJ(B?7)@arr_`yQ>n<8pmpb(!tpkv9D=@!RcXd{#_qF+ zKQ(PUWxKX-T#U9!IkhqhB1Pl{M7lBBG}r3GG=Ka`zr!S;c3#LN=czi$62wMwJ<6YB z1v(j+l8dtKFitX+clf5ScTQ5`G`M%COz)@8jry5;K)Oi7cT;& zzHVi8;EKsQa)JDzlh6*_#V?(+idkZfr!)$BfW`%0j_l&geH0>--3#$&r?EWd3OweX z-rG_{_6J>ZXeN|Q_#qV1m!vde%&TzMN4hDHBZRJuNTMoDM>!Z-{epCP*bjPukk2G!2{+x5Jkd$3TX1480AAa zBZGkDfz{UKqX>b-6w7xNX7 zt$ws{xA5p)hq}5p0KC5(dRvB&y^c((X+k_i@YWbyfjZd>(FIdXt)R(8sJSoh+pYbA zNp1|u@C_CyRZ1qd<*oR0r}0Fje;NGLN{#Juw=n@uax`VRb{2rO$bv%=kn7>I2F_z# zQRqbh)c*F+tXb_)Pb+h~TiE+iLpV`QH z74BJx34$IwrpavL47+$-!Fc5TrpE{(cGUn1Z1`|rWN^HwCA2^JFZWz0B3Vv8DQ$Y& zPy&&G93GE>hu0LXS91w{DPnq%arA7?G=K%Ti_kFex0<8Ig{Q(mX{pf(6A~|IQU~M= 
zn9eVME1gF^*$R_<0)AA0m=CgWGaXrgQGc^TscHzP0q8ts;VlG?Xfyky&7eP&jTGRwoizJg=XAL+{@@6${oqyN&Bt{*Lo^Hp;?FLzHD zEPA=WOtK9l5_VTbzCHn6u)tL)ybW<@X z!_k%|r438FL``4MwALGZ9_YTow7@pF@n;EGz_!O^&=th?$&vT>hr~i`VMkYm9IpC< z2E2GF+RSQ+4$hBe()H!e&9f;TFBb?f6sv*}WCE1Fq?-q{v64t5=QPPP_3H=Lk_{iw;oL{?R{suGLdmw!3fo0S=BVMh z>;xf6c&oY(iKhMgd+(7l2O&H8dhCNQzk=a-wjxuim*j53%+I*7T`tw-fssA@WC{bU zHM)cKGJlo4eEEX(#c!8dBRGi3pwXgT$GW2lYtL`dHwsO>q%zG6~aNZ0qV^l zeeM9|gSv8*Kb$vbA@~fm@?&$~4+5DX{m4L&@4nm)Z}!09BB~SQ?0}Tb({rFral#ob zo(=MFH9tB0hSvZBb^ZrN(Ks0Gc;L;^zxL(&3bkFJ(;MzajjCWEj8%Y=tM`oslYy%H zkLdPh!Q(@SGk5;1beAH~MI)^k%#GhIuD1Z3A;|}U@d14*<{-QJ9*At+NqEn~1!^8` z)!oe!`qby_*$VEEuAZ7ogi#(+5E{^?a_2_Ruw5**+$3rkfjDVuMT#g)=|#OKysZaF zssm7kv67uUSq6BK=cN-goVOe>r7DS_9m5G}k>YZcPyiAq#yQZj(;Ujf*D;@I2pM*h9J(6AW$6mPG+$oc$rqOxD@oekquck^CK!ObSC$o z66Bf`b_-?_R%;_se){vyxG);V2=@StA$Tq!no;kLrwlDyAdEZw0`v9ulXw)@1fv zg4R-9qNfCb8nl^Nn5;s6(Is=j6u%H>`N1z2kGAfX2V3qa4zqCF3v z&bCfCN7)>6AsMlTk4F@+06^b8sMK$GeL1tF;bW{8G{hr=KwLZ9qtYFx3fJ?64PCW9 zOWNOv3QEo+WUGZNzP^J|=8A>W;8+3i=Y|3C)d*@ticObX)scMY$Ej;Xr_P*0yx4fk zDtQ9{)9~e5Ly1VR4E(BTh{>)gji`bFbtABe&ZhSuFI@ZASj1R%*~ zEU+L3BNds6*%;vVW)AY-`Z=YlFGSJ$c!rTQAJh&an@I=oc2i%+umxU?`)^w2zxb0f z<|#M=T&;A>nH11yNt6SX9@Zm+Asp9eO)u5JgsjvA#=aeXp`C=~8R&^bJK`v{{0C_w zK1L)sChm*eM59R{bif#@M%NZuog=jKoJb_+c1%71F@RGYqZc6$dQyqosG+K=Gha#eJT1(3GiFG-9K*)Rz)SCBt@Ap zDo{PlM4wGBpk%UY8%Qg>@F#HZ9fx?cH6wmdD~+Irf5+{G!k>T+hj>-0ygJ+54bEx5 zQ&kfgj_)KveIm;jS~N}kW*iLt87C2VB>Vmh8yb)td58I^Ec#u(-dB|E^!~OzdwQ>! 
z)aGN-B4R<=h?9{*tZWp z$V5glJzWXAuo`+-5_L6dhy>)A&;JViY4VJ*4>cmMx7QrSdIur(s=_DNgMx(n__Cvy zALC|kwL`%o@~3Wynmh~hv=C;3{yKAp@d1@0+*XkJ0iD{{INcVlppkOzP44(@;?FpvYGX`+2HVWl9)ho@wr2YdUG z_YS5Xz4pjL6_1X#5s6A*H>Ku<7dCI`Y)C*Kz^@AS8vR$6 z^Bj3`qBLZ>$Vr3Rl-}$}zOsJQO{Jx0B84l4MjW*o)Gx?_;I>iAjCghOkp^$WRcCP3 zJ2J1Ul>|3w@&?9*03mT%_#i4*-_n`xh33Rb&{B+yJzTpEwmw6}&Sa#m z=TX3^L_^eb*jF2XT7t*z+r#(d@4Ew8`e^R_2Ach=rOlvp(rb>3jCccbWOi5a&Xb%O zeSf$G23{1DV&)!FuYijPZ0kTyJQMc-Dy)&B=^3*L=}6@JH=N(Tc=c+*hBs(-l!tU; z`_Dv1`LR% z+&5AN^FyKyd_rBU1<~`Z+zdR2;L3l&fJ{|(O>ClqW&?x8?L*A4_KNv5Po&!b*8#jl z@%5~jOxVQ>ptq6*6@di*)=*L99}za^rEn-myNG;Fun>VB&4DAeL-XX@puL&|E-OZZ zKe{FeEIy5f3_m{-xvd z{p22%&KcoGPj0J5Pu=Tnsifgqp|_#ti6?3|E-_BDQy63&`t+yomy?2A#S_}v4(Z95 zN&^UJ2kHps!K~%-3w@0OS4sKx6y%Cd&^fWd5>lsO;{TJm@~u^%8*lSgh9g9Z>0~-l z)R9)Nw6raA!U$LbO>KxXYB*@p@=?8jfU*&()HIyT=C;sB{8Y%-&<(Q=+DWuGY%yd{ zmd9%&tpKxz-JOt(K@;+zY{+QDJW*cdkl)a+C`SP8ZF)_k|kQRa$XOWyP67IG_~-%HYOTG(X3 zroDlP(hyW^IeJnzm+6`qA6{@ZW?5{%)Wn3sI3q(cflLz4Y8!l}(=Isty;z%$qHCnz z&yxUg((|3?*qX1d&7WK`E{Z5UOd3gH>{Ges_&9pk)Z~VH(P+}RuuUJ2i$xvRG?VW7 zLDMWu6b!NkfD0Iv8T0h>h3^2Mt(RK*Bl6;oK%#>}K84I;v~kjiYvum=@ba+9?4u(d zbP~+_QYjo)TX^}1k`WoOD2a>W9#(^!?;&o99~*@74lt=P@PB*gub-u-AU8&*0&=;t zkbf-GEI^9R&*+>2CLG2w8y#PG*ERCl05Z zi%1fi`yMeNIM~yoi_IdzUOoh$k6YLeH1ucz@4#~iP5h!pn8=o#@8L7kWNuldqGx;@_(iRVdMf9 z-(ytyvW=*~LuVfTxB&bSOK$6jTg7%_xV{KHPuc$PW80*wah!JMze&EWDG7m#&WMao z(2%r%^j!2wBONMn^4hb{#Uooqaf&hxk}t&vq!USr=c4}_+R+=LOYO{mk!uV#`i@%Ej7ZnIaIl19}LrvTz|NykTdG6;aXkkmh+ zs3$0HaRKA#8w7iC<~gby*t5EB+N^YM{tt=}#I9ygTQpV9VNG{_ESb89FR>2=KvBd( zAuI|l-rj*>^Q9X+BlL5pKJr4r&M%?d~*(B2U6eG7aPQs6@E ziGT4I>Jmm9w2%~MEIW^ope!k!z(l`?nwUsZ$I)N&!`x=!gZ@N5b^B$dOEH-aQwg`j zpfTx6QijA6%#A19a=8Pb5dF1ZOkYd{XsJjT+`PYb4p zG2TlaE@rrptlGm9reoTBgA|5;gGatJy)dv02MULtT5Spnyb`%FqJ;uua7VrY zy-ni+skQR_w~>+d(-_P?x+dYB`v6}RDBQ*nEs|GX%)X>O99Q#~(tU>dtANLOP*#~A z?4{ZQh=DZ7W0W~SihIl1Yw-4PKhn-<8eGwQy&E3wdBV>H27p5W{=s;zjKB6!cPSHM z!L3lx0<(q(`c<@E0n^gk;1!Ck1C6vq<_8-wg6n_Jsfjxwp1m73{^C5$zXAliABgcK 
zFkH>NGM0D$d{Y+zLMnWJeXSVc#^gB2!nI`WmmtIY`!|W*J|_a&KWgf6bXno)3#~HA zoz0z#w}(mE{LLHaqEj)4@}=T)(To4b)^`VT-G1+9hA0^!5`~D!$ZVjpBCG6GAu^La z;*FLSG7?HDEt?`MDY7F)$jEq#$f#5_e%H%re16~mdOWI z@>9KVPH4{5smqOzS1O3)g&^>C^0|UBxf3boKMZjN)2krs0HJ~UoOjOLX373&l7-=liXjhG7 z;=yh2hGIa<`Sn-C_dke%ViXOGS_A?!2uiS59HCSq#OkPavjN6O(OeY3Ytn?5ojQGO zzG1~?=BhLwVtKMC!5q7C=O(mYUG#dG@3*j|m3^xTAB(CV`A<0Ts7eT)7M|6=@-R6; zoy~xp$NxV*a&Lbx6Q)2Gc@TZq}Sa_-ewr zQ7(9dR+Je*eF_4*Ec96@Ec3r0ES&KL>y#e zX{ENM)s0J=*OYxo+XCz+`zoZQ5Io}A`&ZLTG9%pJ943mtEl?U#))0IWf`U(ps@MhY z7=Owm#S9vP1FQXWiY_1?jVFSoC~?Eh1Ib@L(T+v2X<5als&Sv*5S2=0AS(^jGE+Bv z0?U*e9^t|Mde7_my8;v}t3)N&i@TfhLT}uHyP_>_UO9ZH2%^XW~ zexN=3N$A}DdT>QjB#>>*Bh$A(V;2t+5pb-_ajRzrhHEWem}+!H#R?uJ^0aNeEl4dU zS)Rc$=}+Yh@jas5Hr(MYR+HfJ{(5j9NB+#)??2_l7IVX;Xia=Fh&9m-Xw)iZ5iZJdB8ihjvMEHy@$6Bvd1D+`QU(GBytRVLNFm()G zJQ5{T#Lz8pc=o5TyAX@I`=%1&(4X!EDgxwm&n83())!s&8}z|x?@A6255L4X$v-{c zrKdx@EWl=5GRUo=Pdo2L2++}wBV6(RX^C4G;gO7;3~Y*8$X>sSktc#*hq8@Hp$A6@ zo#*|3+Hln(jBuJoCx*(B;As*?3#k9P^F_;2$@FJeM$B6vv`evr*H?A|km=)s5<)}t zZ{!Ro)?zBz;n?C6xk`!r+zp{mz;pcdgO5=A|8;7P(}k8k04Qqymi2KuVX1e=FsGt< z9p6&LyYL$GU*-Ec(xhfKD1H3RaFp&NS~&l-`K)gco6+A?Yk{)B_|<_Njg}P+4^JY* zOdUc^NZ0TXxA>MnC1th1B>1`C2iN~PD@Gu?=(*!IRA|Oru;1T$=^rc)6ukvH5F=7$ z$8k|{Cu|c0qxGLG7xBnWvNm7l1p*N9XCwxeuOQ}tkFY|ljB)+L!6*gP?%4l-G6?ii z;E7=h{S8_p(lD@|_qiiHUaC*n8}cghv{4Z)4dA)x9oo$9dGW85mtS}*?!?@o{|IlI z#N$XdkRWh}u1@~T<;t4SA&&WJH=SMB5wBcH5Cvj6HXr|vsYNAT6PN$=cceA@@#R+b z5P}hr#y7;!ZK&GdV4-_46%ltC$)5yZXqNChoI=#Xo(`M$?)tM)@t9zgA%)}>R;N?y z1Z4|p6G8X+18m}hbn@wBzQ_QB_dPHNfX2>yZ;5_7_RfuYemd0z>($?8K80f&4X^%H^8S;$Gb=#>oAY>2kv~ za_grzdtqt^oiyY-e}WDKJZK*LauD^NMrx+!*tC8>#uMSng++xrv0a&IehmYl3qQ}W z;$|on!$OE2s^T1E$UU={zyt#7lJabLrC*>f7JWBN_R9G zzEsaplfm^(ix3_$){3D0?`;L=fj7rjy*Y@7*tpr^-r}bQ>`GJ;0B}+NNM1gI!~{fC zq6hcSZsZ`83+a73s~77L|JY zE<)Limg1TDw4aO|?kjzhxZ%P7(^{to05|toJ+{EbdKzq>=;P#1aiYLg``F+NfB+VH zlFoE=OG0()yOn@y5!=C<=GM+dwuKS|togs{c%s@j#NGd2h1cdq+%rtQAAOY_==w3S zK^Q@>Ua$QA#Bzd+msYtv7)}V@#9hIcNOTZZfRC-W5a+=UC&U7{mQ9OY4~ 
z$GN>BLF9PW*A&rlBSZn;>jPF^I5&~DK!;sM%4B=@@@@ypf=d_w1AW5_-M7Di#rZ!S z(G)RhT6!1*Ebn^D=a7CfJtM|NQz^o8Mvl zIwiJ^4D5DZY~A$^H!qIdU$s23p1N31feMFXi;p49lxI7aJJ7VNIXo4Uh~HphhqRD> zZ*|3N+7}qu2&u@~qdnkJyPnzRFa7~X9G}d=o6)80AfzLt4J;V6JWf!0uDF8r3nfd0 zB*FGYc2DTQ^Y3vySxfvk@j0O&CS0k0ecbh{0IcvQ{E-MKAW(xN|L3GEY#$I>KWx$8 zx={1!ND$b5GA5M)>>6kYBx)(NQ4rdHpL`_S@Eh4Wj2(3nTt+xWzg=f{V%@J_>o*+O zu)#(k-jN~ppvl?%wXwQu;|^>V&bK#Wh(F!Q$iIIrvbkf7vQMv~>v#r!tE;bM+gQqjE2$Y>OokuwG3W^hdC0| znLBGl%>=bW0NQnGSTFp-GW;!mm#2cpTFztifPdD-mv|4xAmZnw2!4XQ0NhK~fv`73 z;xQm7wDi9jZ&HGL^99g5p%MgA6=uESm=8`c41;B2YYItt2}V}(^T=Uw#~oV*gWy~o z)W&w6-FSnumgEQ?l9vyF{6z@!zJE9sPyBr0jWw0~0pSNC)TA_3fEI*wF*p4>&El6@ z_{>kJ$X?SA3~bF)Tlyx4!wCW?N;Qw6+SPu>xRdoT>gw6fZi5X=FU_b*IrE`M0dqIy z*fmq-4>k}J)4wygVf?tNsnu;Pm;i4Jq*y@J2TA&hS?|ntkuHf$kXO=7!!+OU0T6qB%_TjWj-hP z{osL{Ql5*e;~-3W(Ym|O#Idv#uQuiSeR1Zh@S9HQyBjox}DHgbM5Vcny%DdEyAq8);qz3DQ(>%D0y zxY1tA+H-p0sVMSD(_+^W)>7>X%_o2zB96vCP4h}E=I~BR!={i(3G%;&WCjpeYKsPN zI0A#P`Fh8Qcma~&Jj65B`iZed^!HQ4@;#9tq5c_VFabSuY;W^L@!c*hFugt|9P|4D z%>O<6rl4e{i1j=P%a=~T7!x7sL_c*JZmM1Tz9Urt{@e`nw^x4-!QnHN;8GDqF*=hh zmX#8@ZEV;qeuh{cMek#(lF zmQ>r=Oi%px!e)McR_eb&rZOk^qj?~+4r;I9DFbM(J|hd6=wej-?qu;I^n z(o28K!#GEYf;|##vRdP#AaJOh=N|vFsiJi$SFV%i+h=C8=_{lvAswzbe8Pux-(7(I zzrW@M@g!@F$Pt?_XdkG_CUp`kwI?;m{}y3`xb{A9a)5AQvn=#~9jxe;lxXcw&lz`zWL;c0P3Ar)PJOhw*hG4FEFMxi3dIy#|gzVr7%c zf4-&SX-1GpAMa%|XW4phI~@e@T+^aRIaL++-C24Woc-r|Vgwn-ZVYQ>OU+#GXf10vC)p zPZG$2T(Lx}Y0pC7n*RmeO|D>v2pbs2Y}h*%Gpu^Y$eeD);s4#JMjNqFGuN$dUwT!= zGA2Q2z6eG$I)a^?Eq?Q!maxkBD;m)ArfE{y_M)F#0EQ3hiC(M~`c2)abAkCZz1Z;k zd*hI^kU+Zb)*rOBYExV~pUcRn)cTSA>StyHBYPE|c-qKZn(?38;0X^R>*2|?T)w59 zIB>BNEqG!g&`}D+;z@K2-uzkk=w`|x+i19A0;LH+k8E0$C}~kG5}&cT!AgGVWpKWi zkrR_G%uNyUOV9pw1Q^d#!aHLI35r}^3E)Ge%xv%Nmok^aAQqPABV(-_hFLTgn0nIv zc@k%Wls7hnPJu5vmP6s3S}YAP>ZE^!=i}qvjR~9 zb`byv@0C{}BXdOBfKe5Hox+9qA`!jJnVN&<%I&2ksAZWawZMD+QRgwh3aC{%RMl56 zJs>d>;IV@{{oJx)L$Q;v6hj1WXRSPO;hwaE4|L((_9^J`(|)T zA8dqAur8X)LU;1P2Fm0+z=Y5(xS4TLr3ykS9cUgHfUdazVAM!AZp`)S@i4@|QB9zA 
zf#A{H5Td3Ty`c>eOz1o8!DkSiUIx4M?Xy4>Y$G>K6jkr&DO$lB2-v{1XBi0R@IxMX zDF(qR`rj7i-#-fZNy>zuCk{?rxG_aqZf4dI{ zn3N0~=7MnGmkeG{WN?Q( zv#R9CcwVM9S@`^z{oKYI9kyfR?-=!aNqKr`%HU`O_C~a%Ic|LaErKbwEK4V0VYQ|B zCx2zWY;#{recpyV|II6uOTZrlKYCGh!A+n;J5`>wVTG*|qgrfG{_fgGXzwUy);SSO_?ZZx-<_}ncEYu6-`u0QiX1UB%93L<<*{$t883T&;x-s~Ag}vbFc>%-%2-%!pC_RC63H+}Tnetm-DY zl??N{nT>?10C5`i=b&w;Tis8F5B!k>Fc-JlSnNJgU8@kOrdn|n zWO(2}n%0dXzJ<|Z^7J)qEJUCOmse$YM}HmvissWw?(MIPAC3tS!(M*J%4j*-SI*QZ z9wAXyecPNm2i%45f$|-^D3_G$fGhS5ZS5J2oz!8=4ttuwkl`D`EA%t>p##$6J*=}UOm@2kxM*h@JP|(D}9E%ji-4{lbfVCN$$jd z5E+g@7x)U}<4lf#5g@o<)}A40iurOS(n5gj{0eGEt6Vxet)TwLbet3gFK5$Tz#QNN zfRW&TiaMrqm1ym3=-iLM8TVav|G9edPuRzi?BWAZctu{{=rsS}gvffhF`0b^ip%8B zNNde^Qsdr~T>ICJx|mfcRxuM&ZPw4vI&C3slK1*B_9xj{cxoex%Va{_gSc|^H7{;` zLv#c?UKV_4+(YUmvzipw2o{tB=Qy~coVronP zP-`r1q0C*Kp!%RoE5(4#*%umIZx~5Kbj6r&2hzkQUA7YKk>XYOXRN;B8?H&vP<41i zXX&4uWt6AfOi^*rVidY(`po_sA)f< zPpvlTv~d{l7`9jdP}NfoK|4(-35S;qEzSu$oW9SSTNmEBiZS}kRk!6HBhU9*QZ!}^ zF}yn^CS zj{9s6a2auM6Pj<%s-T%ZYU1Ha5gZl!Il){_#z8k9_6E)d4agJZGfSJ$Sf zWbC+_eKJSLQJt#%M7?`PsXbwpQZ&5q5H?IID2#n@SSVo-vRTsLU1o8OXl4qx;RwWaigVw+(FTDV z;r1g@N+8FvaI52&a~qm8zntTSqYNTqH!}OeZF0qlg1Q@E=uojmKVv%zDr`{Pqhebo zaDb4q7TT(J>e%-PZ(|2(AkU^@O-LRF>mo%DpACj=b^W?bSbm=yMfb`A&?ERV!^omX znX9LRFn_ocr3S>cluB=7bx=-kHCch7e7XA#kGGF(F1m&Q`%o=r17NK1_OC+hszq^E zPR^=#j6ho?q%Uc{CuY^P3UDKSAP^w9#=(3VFwsp^X1d!Cfhi?wh6Lj#<+>Gn6VHF> z**Kk=kv|N2jXN#+suLxl9u#cWB#kwrK~(P|dWw&yJB!I5@i@_*s+8zdNWJQ!jN}0V z>YV=I0LM{K$ZTdxUEe5Ux8HmOzYNEv-A*#%f!J@6ND$0+0%mbNGktGyeCeIlw-6Dl zakt{G+hax%U0;zM|3J(9C^z{S3m#HXV$gTp&#!Xp1K01Mzip6psfc0ipM003sn!qo zpXYwvD27M2Wq1BWhx>D^xl4;RMU%K4={$>XVN#&f;_WAU2 zHs8myG?M;9mXrd+xFQ#XR>Wno^23KVR}BN|wie(#CeZzkFvZxljzUC)kjUdv5v^y5 zrnuu-k`=+b!LOS`?<&VI=d%B*lrEa^{LGhMXF;-4%;?S_N)P~fnT9-o@b=~Hod$+* zH!@i~5^^6IDF|&@BGb-xvV*a;RGXSxcriqh8cbxP8&c_Y^D z;;O;t#tz+8&Au)Do&8k3i=cx^XSjkxw0hxo8C+uN=a5*+zwuZb8s@g2FzHpE#zp7f zO`LYHF$?cv{HNH)O{UCOw;z5&!>sik=C_^zkyHYJy{n|3xQBLQ20KsfPgCt3;KWj_ 
z6G?LD-r`Y1m#>_B=HstC&bj)oIb>rnZb7oQd7VLN6Xs4GgtIaBJTA7>*zr4p^(c3x zw0)$)!`7M|O;^?utjRK?Qa&-_N-%c1>_LGWl4f)!0~v``FjYFgsP|#u!ye6zablJh z#nwq%68TCuyBW)r;Kmyry}T>1;)MUUV#SfFn0>qLDhXv2l2~422X1-yTN8UqmU*1j zdP%Wrd)ie^amCzc<2sT)!F#={g@%#^A&OGx{D#CU)!5x;@dO%JCWwviW3Zg%8{HuL@45U83i43f!o^zQ78( zcFeYji8Niayr&^^$+Jy-zMoLcgI@bC!O6({BO@Gf+zcZ=R%Rl}d;fxmuTzkqDYkG4 z8@8Xs*}6FLNkfX|?!J4!udc=PWY2}5h@EY3n4{mLj=W!I9z=iYF|Us1r=h5f88~XL zl1UqIx(Ul6(W_o(D%$FdHoqa=MeeU3edYNjv%ErzdD>s77$L<_EB*YJdZSb=-kOe zxp@6mJ>&8$3=d3k{`?;V8e|1npsQj8nMJy+pr*#R*Y} zi4SE4&rq7n?&oyT%{D0cWd^~)e#^dVuf2y-2G{YW*y4oeA3r~I{kgkIh{3hT;|eB2 zt$T)I1{9S4)1CeKfask5$6L97zR2VerFoZp`C~vj^iXam6V4f#WnP(o<^YnX1r5yT zAM`a0?l@vtHAY-qfbwaLnPz=&FW<;f!MA2HV<8?pFk=5L!rpo8A*x~gLZ!cMJy)PVY#JvS z;?{DvJUD$$dU7-Ce`fY{))kHeIey)tQb0xYzT4l}rqdhZ>hxSG$A2Z=d+s3JH;t(_ zc8~IZ8PK2H8|HS$KO8#f1B&h(rZBF0T13jxyV}C~^gt9d1n!@l`VVflEz>g++^AIf zZDgEvaBXU#JWNdXRvp5f>t@{Q!}%tI<)<0%$6@dEUApR?x;o!(UiQ@>8??D}#d`3C zjLwT+1eyP}Ea8Gwv^kE?d}?Cj11p;jQLi+VYO?~flPg6d8^3r6yuYLs8)P(+?~6Pp z;Ax%f4SIcdzZXA_BD;zaQSsSN48w8tVNY2d$GlthEpVLhq3e2ZEk>Iv61|03c;PO1leOxzvyTRYUStoXhwB+rHm~EdsPSb>WWDk%Ppfcx*%MV%zVqVPS{#}mu=(4 zA<^B3=HeB!B?UTlYi0>LU650u7v&ANgxisgiRA}J(qe{v5s0h4vD=ZM0&piUng9Y1 za?JCbsUgt`Fd#`H_I5PrUh)O#*akpdfpY`5G zY-ivKxW(T5L%oRVMi4=r_V?@5@L?R$JLt=7$`Nbxim=QZ(KtTvWfh0Sbr5cPXE^S5 z7TW9M;(@UAeJMT3&2#m!jASQdY3pIaWc8NJm`>XMIbX`-N{qMpPoX%Y<;NRP^y#@@ zdp#0#A`i}4p=6MJU(XRzhzjdErr-7RSy2*A&-rb)pT{-aR?McF?c$lxdHc-3-LTq@ zI802jK_|0>Nfye^ky^z0Qd>iI<8glX;QeENL;Z}owC~CbG9~u!`r&T+4bm%V=hsGJ z;eVmy^9M6pgZ$=={9QfFAwg9K(7WiX@Y6Ngwka;(rQn4y;bZTZEBfBYZ%t=y{ZBVk zJohEsz5%{GaPd33D$)Rz$jZ4omjc$$gQO6MEuY@i8a~?%l=fMq@Afcq+Ho12vMzR2JLDh4_ z=F#m&>eC@iUu)YGy;C1XWPtcE-z(v`d+pG=O ziKg5ii}m2rv%w;S+C#`nWW~+*L4C}q}>!3EuB|P%X z8sZb*?0UQTN>Kk)Scqupg<=`LKaEN)4?_kuebW2Z6)6YZoGHN3nDWdQ9lvZiW@XD(%qCFGg z8X|1HT%i2*D#yDvowxg7y(i>)N8EviF;4D@>}rs*OI_4}bb0k*g`3z#YYz?H$~%w5ZB)xl`9at?`z4=lkN~w1umi*To9JwE7A3`f)sl zNpPx`y-e$7%P83S{{7j|emiU9m)+|m4H8yA9523BU3+M^&|2$c=QRR-%MNwOWj+IS 
z`(^cv@P(|0Zq;Cgs)GjpTV)5w_cQo7}=PgLp0{h}ot3rjPe*FeM9@RZR zdOB1spJ}pv%NTuMB=mP%QjtfCz#&Iw3^r(aJ?URc4N&RZYbYLSRn9B<#Mh=-Xxp^5 zoDqre5w=5XdFEk&-@CAGL%&PjrA%JEP9xsCpVfzX+_lBY_U6I zPcMYCeQ+Z+M!2b?qlSkuYP z%hwPrSvjZdWbN}h?}%#WsfKoAmACmO?Xegt=rEy^m#+K7H%Tk`v!KdPFqkx3*I2I+ zi3n2^yK9-Wx`B;9v{l@0rAJIs@z%}GB|Cy+h8gn@J*L)o4+YMZ4TSB|-u4*@mBiKu zV$stHlCaLIwI6yXSChs*o4CE zsF-{iGZeSJB8S(WtwWQBHQ$_kq%C%jB&HKM$~Vk)&C(~v(;slAn32Hb z&0{z0wuX-1iO#&vrQyKiwa(m`H?ovsaldv*H{UM>-={NojV$inME;zi4b3X(>@_j8C4Kzn&nn6$Wty(QSU=nqEPw9 z_2^w80(zRh)%u2NCsVITsK_)q65|2bhN4w2(@LD73vl20swpD#pr*82i{&yWn%%}v2e zIP{}9jYsPko$v8F{zPxp2EoW|g}0|S7BR?$dDAKgzYuj_UXb;CcuhS7N zbyGHQmI=_Esoo$%VYx%&4{kHhj7hCm47mURE|i1uPsOs8m>LqnCHK733gSH0&8-Vv z@IS>)>^}4YhM2-{&Om+@TYmdn?FCcb#&8y)4HT$28uIrfWKd*hEEEkqsAciXRRgI3%q0AjvS{{hPyK@0SmO1$xY6trDX(zy7s7wCUni znB@la@mQFOj;q~9g(NX*;08o(H5@dCRd9z^1gfI zx?l6Xdp+i3NY2ivIP_V|A?k=V%Vn~WRT+QNcy@S;)%lD>SGVT8ZEvi^PEJMl>XNG)ideKmk9BnUY>BYww2BZ> zlCv1QmXmowUo#Dlkt3y8?Z)&9PRQMe+f)v-Tu599otdtP>{j*v= zN}yFaud||#H$cXmF5>`gZXF-vE27KsyA$&~7!(G3rEN+zaU)sRdTze8Wf3!b_-U;8 zzpI__`}8*znm9Z7^t>NihU?dL^6rzPyVoYw&efD9zRB#ce}nW-;?3uE8oTze2J{te zz8>-Vxr|lxivx9yhV1cHk2v_0BZxsk12GB#T2t`!oUXoAIcgCx`WaY2a>aeuk4-W% z_RpWRzsff|9M{>>u9om7-!Yw_4ZIu7CU9Vyj!eOyp8I@Gzp9ST99ku8m|gf;nN4H( zknrQWd@3)!=rbpF(#j|g`C7i^a`Aebq8r_V-j#Rqw_}CJReId)Z)t$X7vM$^AEv#K-_yEc{t!M7CWs}XmQRRM zfkHn}qDqVR<~?9EO7{6f&u2j8##nfHvwffD6A4CMGM@(u4?8!GsYw#2J1H_K^??2% z=MD5XVmbx$EdQK%P?3Bu++CgwTfyA!#$*}o^E*(qIs9)Mx|YCbnn~py60& zk;|+gYw7`RHuYvg3DPHBC8$c(cPCnp9p${c3x!w>II}sXMa|uMmx$cU&?QczEY0Yr zgeFk|gW=6`MDcV(erUqWe0%1b%AnqCF!RYfTs@Wm%)cO1=&~BH4Ws&>Y zOJ{%_7lyaSVp=2lCb^J_+I+u)KA}62%E)fqPd-53 z=~qY}m%2L;_G?sq%TjOD&-Qs{M}MgXiauPJr@#BwD0DQKnjD>znAoEmxN~fS+uGGS zEXS&oJgGY@Z{cE+Tk2d~|NeK@wI?iQ3RJHmXvznbMMslAn zzXzdghv;eRAw z{=B!m0-Y@lnZSJ8id0McPA2#xr|Ql#dZW08&ecA>%0D0G_NXb^e^OQwL{pRB;WP+n zRD2#0uA1|$YPAF8VcYYM52_R>yr!qe47%UR8Ht6F9bBBlw?B<_wtbkxbuh|sN#=B7 z&;PJQCYfrz=NqyL_6=os-K7XSU*)d#3f+yju5B-SdVamwTx$>&;p7KC2N^@kX5)Oq 
zpDgORX~gwN|0eXI$H#>ut8UDVI^7`m@$I2!>GH<{gE%a)g|(rr_P$PrCI?4n_gYIi zdsA9zk_=M>>aeL24#DoAvCHUpJD#2eSu4_Ug*;tMaq25` zCY7DgP;4w}^bBrm2!Gsg2-yF93HXmi)ntDr+=>sqA8t%5@{O_P&h_ILdT(&WnseKv z=ufmYENmuqh>_0ROi9uNxEaHrY5A~MwEk+fjKZ0onuOsy*Jm(gO0a zE802_%S`?}ba3=ENMs;auJ@F*5z;lG2xx}!>bDhyGr3#&S-#bgJ80YrY0xw927z{3 zD)*P*vlKb~z$e4@jCEH+?6N|SFVy%owC}Wuw7amD7y>k8-p|(@Hubq+JMo;R_A0)S zLv~`@E^@`GQyhID!Rh7X8Dd14Pij=G^f`P!S;}xi6?1?Q8=B&-! zKI%*@@nh9xsRQpFRqKbSN4_*~=;JdTVY%QZy?Onq4L{|xSD*mf=qHv9UP6Z3u`>}W z`a2nt?sVKWe0Fl#QQJ=VUB>;|`Mh?QR#RN<-h^=VUd2Wl8eQJBeLIH;b6ac4wAc2^ zm#L>&#WfaAg4$Mc36mdSqB~=%ms-UkgKV>JXfS30iQqcT96`}NFPm_nWI3UfG%yw# zT7T?T&ik+IPr70b??4G_)T7v>}|1;fxSzll0GsK^=J1xr|dOvAb;7_#({dh$=t~Hy~ ze2Q%=d~GCiKhJ%cC<<3G_~g8C`3fpJmRd#`3LDy5Xe=kY4seRblkQwQtNV??-!OKQ zHn(Mft&U2^9mSeHRJ7;hiwfB#t1dEwEZEc^<$}7!=opjtlld(HWHP-B-Z2LLLl0uA@Ng z`0W&JmbhW4gtjfM?lN+XL)Q}z%~E8YpWX=p$S886^VXKLA>8VcUGod3c#Njj#d0UL z#*7`cuEFjA^QR3B%an*hv7aB?>z&gn?h2o7La6XNp;L6DH9KC-dez7rQYV=J7{~@- zHV#Z^+naaW9b5hT_vZiP3{t<9(V?50FYlOxL04W8U%K|GfrM=jU(;HTW@(RIB+2i) zm*~`hHhLazXhbWgRUV(->6Y3Jp=q>M&gkru_FdkgaWCslH^xStS+zduy0Kfg>`AIO zwyo+HU&q)_?Bj_PJyXaoT@`Y9jlRMWdD6ENL$z-jq!n+o|026LqR;waK%;4FrJ&>i z5Dke74gk1u4cD%duiphw`D}c5J7p)pr~?DK)rK=przN(~z*tyK$rzNSqpNyB#`0PF z?(CkQUm89{EmTi=yA}HmDQ8nni|0P1RpYpttTZ-uYJT@2*3TZ~Ur(W+aqZ-a_lXA( zVwpS|#)!O4@c@m8WyR}=-J2;w$p50o@qDIaxp5psTwtL~}f1#9B*td%xNm+oE$+C&mklA5*3Y8f3mg z)Ab8baiVVyV8jC4(cLfqV~z!)?E~u0XG}o{6{puk%6?BjN{I_Mm9S5X96JwZ5YruN z+xiJaeR1o7r!yV6(ueJx!X*Pg&=vp;#T#k!rbynQ2e_7 zs8{B9U?p!*GKnJe@afe$!Kc*z+n+8%7FFk1Tn%Z^rr5oAnvg_naOCK*ePi&UF;<0r zUE5pN@N-=01&(1W$tI4Ebr(&*CisoiKIc4e)#}EXvSym2Fqgv3xag*v>!xp&pl{f( zx){9gWk=OBZprxk6AIJowg6ARZ}D?4gqioY2}M7CV3>5Z85djguK*OFKXEdN9zJZ@ zIvd7j-?rfBKZ1^nogV66RB_Z zDB$)J194{@MqaWB={+%6v-v=+=vp7N?^ISYTAOhmY6w!&GhpQ<1xc~dm9f3ao6O!? 
z!d^a@ew+q8+1)e4M7I9<_`2LOMH2apMd7_$E>DkNi>@x)afELgUa0HTC2vL!h7P{y z+MC4Mwb!zf3_0V;9p5#k;@>Om4brkLw4rrY7l*EYqVy-YkXd{5nhG;-q)Pa-QuGHZr(K zS4t8+)>(hFF1Ff$y7pM+K`JR4!}#kr(nh|bR%e$>=?{8-hcMX%Y2~!T6&2s(cPIO9 zA`F$7us--mNSS3^F7HVFX(Y&B4Hg)LK5e8*5F@fb+EMy3L(kcIWY4HLIr!~j{ED~KdI%Vh^86&^EuB%bkHxnxx=I)#&bWJ4@?Tl}T7g^}PvB(?*67MH9OlAa?1VBzFo1ik_T*yx)|(6x08p5DRe?^mdQj4MqvrbJelI&Ln?vVIE z&G3KG*O!rA@ot6gwp{6ih22gV7y5?GK=oPOEN@KvCu3Mfd6=ZQ=X6hAu1CxiU(EGdfilfI&JKY zW)UJRM_E@SrwkbIhU~06cBn@Xe7|*05JXh?ro&GC!IXHk9xWpg4gfn4eJ8q2R+mnN zx-J*?_G#uhZjuQcw%m=XD>40Smb#vSo?53O0kreJB~<9()LN(c)f_Z5ALeR;0+s&( zRVisZGdD*0;9L@88ennℑEaI;~}3%%lODwPO<^yDeMDaBb?@lncA2>9+zzqPGY+ z79TwXmP}DHKU>`F&X0|fkWm6^zrOvc)Cl|p$H|5d-aA}Pm&zI}^o7iP+S4g0^AJ_1 zGvm3z&>n8NFiPu4f+$@dFWh}(GzAaVx6F^fQ$d-~u0?O998&8a+w0}_qxcm>nP9DS z7utuMCp|v1fhbQR_<6qc5c*<^$L3x=pAjFs{?D^>I5Nm0Xmr~<` zT>XZGTkY)>!PpOQ8uPkDv_zu>ifvS)i-XL4@)v8tC{zHeKEr}NB$#;70V$68@(pN{G;w4Mx(t>*V3=zU|N5+~xE zN}A~&eTUx`raU@#P}i0yavk0J?WA}2(-$5?<`x_5-F4sR=%J%Iu0@b5)Zlv{ObRZP5h*@5yrTcG(buDAMj52_t2*(v}7KwSb#g#+I3t=gi z+OyNvhDv3}V1#5@LtFa?aCBYS2{x^m3dbF~^dsq9Yaal4>lL~#=Q(zfi?xcwqTz_r znVOs|Y>@sK-NZThGBbZ8M?bsX&S~@{coXIEVEAd~--9|P|^!@63oq57P zqBq`k$vt8C`;ljYrIC(*a|s}tXJ%Q_Ug z_m3ueMVr&tHGA;c)L3ZrvI@kiu-!{3O_AKQDX94xj`ybL`#V<fZ( zt>lvB`RJbXRd!Zq3f{b*M_Hw$v@dVBD3^lT45OcGpiOh>sJGD1xq!?;LH!*a;}#t)^bI~FWjz<}*1>hTXYbD~p5i}-!J>J6`M`D$ zRxWw>T7%j8dM2J{Tu<9yC+50*kq7xh|Z)alNE%ggqmV z;L0-}?KD?4z~9r(OIMnrv>1X>OT!JNT59d)|J7p{5b8N)d|& zy-3Kvv`E9?E1K*xqq_+Ohp=YqY`uVy(j%LsEJxokDpBD#uI7hz`hbw;PN$bHr49-EZb87b^Rv zdv|%hb=@^Cf!PlFYr=+pePG(R$_9({V}w}Y$sfL|>vhVYQy4QzIxZ}b*Fjrf*a@P1 z31#M^LV8-0hnJl?OQS7rs}XJjh3;X~opx;*k?HRFins2AWwl>91l9-YyLUgF0KY}yLb z5NIGeNi8?_vF#Z2i2JuR?j-yvI-|UU$BK|QkR88gQzm+$6TqfH#;^Jln>vVj<#Qjt zLs*;05q;#QgUP7ZpiQE=ZCF$q`y`lv2~zHfj zy0!wb}&Z#0Zk8iK6 zTI2Zw}Wr@ z`;}IG+}plFBs_z@y`aq5qC6=m0`77t6OI<82H{hmSL$)T)x-3cE7VtZ_}wo{W?AkI zkbXwCwsix~#OfY0_qDGmMxP~ppv9TFz39I{^cj*r;;l{sa~WANvm0eA=pBA@@|NXg z3=8BA#S|u;PoYRYt49WmrT>rku_4XtsqdgzGG+2!yj~)6)_Xg{i{o(2N 
z@(<9wqGZ|X2fw}i$@}qj@c`n4+In5fhe8SCcA+N~nkkR(O}%U|Go_x2T3z7MKySEt z<^j{r2w1#r6ZU}der0PTDs~ zjI;$P#%zqAsx&?yyGl^T7EKC?h(~ zZ3*Nyesbwhe2B8ehQp%MOUT6|%Aw}XZ^;7OEg76|ucvv# z@;8`x5r6za)pG?7UX~Hd)w3aN(OXQVZH8#}%dg(9{KA2iUhe~A$tP%xY*vZxb5EKY zIU-!1FJG@|nfRcn-p<;87tEX|86sf5yF%=G=i`uaea2q9*6?D5R{5b68VW*2#ZL!`)hif`Pq#%!8dp@5KT zw^b|g1+tpA&c51Go)1KGkPR@HDeLohdho>2EA9f3SKN$s_7-Ey!9dWZ>zM1fwY~+$ z#cxnyN`AFsH*uje>QlzOdJAur3to?q#Rjuw?8=|bUp-41C`rqG#^nFI3I04xH3xEE zFW728(iO3OS8wBuAO{VJoJMN(uDusU|B*7Pv6CwaT%;{fmyW$fY0a6n^_!ctEK1S~ z_f)Huu0U`A)$UsL93*9%=>rv?Q}{7b2_^EE2gh{RvnQU?-E}U$g{ShxE3Xy25wK1y zcF9U_ys+tfM6BQEQU1GjH#ZmGN}EQG=m4X2*8;FvuYnNu$mswls|*Eyu@1|m>j?<> zlvG~5Io5VSOhevq;~OinLCl?s7K{8YPOIIu^?6bE`J3;Gm^IIr)N5~T&YsZ#G zQ$&pQd#qHII7J=z_;i^~ELXc!bZJaJzZum1D$F8+6d_&06MI}GZyJ;W>OhJ+96fk| ziVe_L zBPeVQAdkCX!_iF-Osikwh<(AegQ17f)<@={cHr}MfH9FLa;3Xd8qd}n`QSQ3y?o-% zz_nYJ)m2|X21$h~QeD)D9eQfxLuD8V5$ou@g@PcnQ4Dkmb_njGSQ{5PN5y)8Gj|_x# z^=8cNR5D))DTe-#=cIZRM-TM|kzE^%?>5M4ztx`XKbDsE6Pk+kFmR8JY|^Q>-wH_S z+sC&Zpxz3WE2A&7Z5k-Wa28$r(6cL+Bjj-^1@0ry$+i#)qqIfHM3AUwya*d)qE{z> zm8_q=oZxwcpI6tJ(OgV}Y2a02cTaMs&umK<%7 zE|g5#Ug6GCwuJ~#wQaTDrRnFQgP5)g3KC{uV>50$^kxl5mV^N8&3x% zw`vyE+~-7db$U&>o|U{jedg6CTGKaC-Qg*?Mlp}QETwBTOgi89lxrsni59$266Dd&8_mg&=@bZxL{99g*6ijs5$AgvjYJ~)wA^y z(m}|Ynu(=)IPm}Ydh4jH)-PI^ctN^DLZrJ(B$O6Gx+SE$OAu5*IuwxZkWd-in->Rb0Y9-qBE3S>?W2sZ9cyQYzvYf z{e>W-iEaQp@8ed>KjXP?23#5)Z&|T9z>{o6E4X~foRkm6lX(m`Bk0@`XzJHv2AsZu zHca3RUD43G^~33%{>~5QeCnA8sd)?re9p;B#7(cmmatQ1v8>?fC$}t@GHwno> zpJuLrP>sqYHZ_AsdY3tZ3G7+U$!dBjw`s>EdDysGQt7RH+ql#8GI6JA?d~!zNpy9hObLtgU>*J#1#^j-5y!nD`Wz9k zsEK@K&+Hs{=Gv$>E($e4ff)vKvrn53mT)DR3-0T3Jfo`KmE$qQqxfM2aHPO=HI4d^ zTH@rl$Z0Us(kdo;t0Ls@M0@nlVM0d849;e;tajtPM6bT7doqVjYp# z)Tkq_w6o#IC`6Qvs|m}Ildv4?vtmlo-7n`?UvL=G6ij5T*iLx6lmuFM{LWdj-$_(zv*f<> z-ZN%QpvCBmF0f%7!oizZP{;pVUx4VWF_M#M$>&0A(f?F7&xY-wFRWtn_rfLnHq6Uj z&0iD3TDU>Y`%7$&fb0OYPbew6d9SqH1aT*u|EljqkzznZO)3qsdNuV~oA((oJ-PPc zAcDh=a*!UPjyA4Vw9AdP^W&KUSLy;#?&iiDK{p4K-^_1XIZy=F0?NH_4gwe1ePzk} 
z4$-VUdgbPhY|fWpNT8p51xtGnQl3>8H}eK`O(j3f>#$#C$0O~M)R6BTY(fyXPR!*r z586>Wni3Y_#?HVm`ZJvA%b); z0egPK5xES&@;xwMQ+p_~3t-%i-^jfRl%I%%&uRju!B!KCr^sZJr!@dFHs`k3Qp=z? zpOC1~snI>p0J_K8nyP?B>q2^>h)cNK(pw?#=rLdi+?4boH*C<5kEoCuzyK4;j$JAR zOc;i%5QbDdT)XmbGGZ7LBJBt%Lh2qih!#C}u4=XeC>Qs!x;O?BcS*EZ3jieEMCN`6 z3$f5XIfiTLuQ2PLbnz}KLIh+CK=0Br}jBDI=OPETG`uJ|9L z`T=b^BRKS^>B7pVwY$j(5oqr!fMU8wA)Pb<) z(_Va!R!p@P7als*p+tR{Y*Z^8J-bUXnl!0!_+c|l%J!y+zQFSpen=>=e;@E7t#9!b z3|$a>{IT4p*Ry(6VVl1SorGL~A5}D`2Y!EK!cgpB z?`PTXF}T;tZAy7VAs+(9jX`xXOib3Vp$ysFPV+G1)VWCU`}@h7l1T9J8@yuvhlUCI zjQ3kXF-xFjwGiYWX6_@tYJdQI21(vO0*9RvA#gQ%17!1{TEY?l2g*XQTgb9HS%P|~ ziS5bn2UdnkZp(+q+^_?HtMS)2jj(PU-D11{>+K!kPCg_Mm4XeS+c@d}W11l|W|%xz zR?GpjD;LHnG?oD0f{k$i%<#cZ>OzrXFbtNX-dODa!eq|rjIzW?AGy`fTjeRz$G{v9a6b*T`!FPIp zU7=r(S#}Q>s4L`OL=kCasFMa)LBaizG$DAFnSf>z{1!-qa|Z7xBmd{z)X$h9kNdd5kvyDq~`%z3kf*^hjjthzjsWky&HFe z_xk2LP`ju-r?khJrCjq4a4Ue2g_`2c0@o*nnmcitOo?2cZ~7zbbVTyjmOX6?1k`?9 z{(=m+hh!PRx%Q2%dLLO29O3Am2RVX={94#tL1Vgi}y zgW{}o;2kv7LUu4l84Ne=;DyZZYr|dxuw`^1fqA$N|E2_&Fz0KT-;i^5KLjRU>eD{{ahJ|T_Wz!C8C+=?3*nur;=lz3O3_^+@U);PhjFAS(%5GO&sOmp z)wDV!%ZqfmVUu%ch%UlN+8#1cN&0rEIT|6nrTIo07CNM2g20c!Fz>)&qG2#N^&DWy zaLhmN#U>Wp9SAerI+%|$>wxr;Qj2fl2XRIl*G7FTzEi_Al!KP%Ag=2f|T!I@tfBrm5UUG?Da_J7G>VqYFzzeoJtnYEybi1W|4;kNjBF zw5ZhY!*&G+7tg&mSOQcR`OwHzLWPd;kRt)8eozyyT?;Hj6bnH%>hHXpED0Hh1sF|H z`~$Q_(T2cC0A|9cKzYYQ*#%x9jA#F73DzpK-m0nU)qz>jgF@6RZKvJ@CI|v?Ilgsd zzaCAm2S(p!<($dBFxVqh8JYh5L51=N0|mg+A{xAN{|8TO69?QH)oWWoEz7yFsQ7)b;Xt<( z{VKlDFrqI(KI)VzS;zV3*ed49LK6+N+l{yxFiQdwBEVJyBf5yzC6Er+rN_|8A(WOw z9_%A|l&;gKf7FgIzZvJ1ynDN2N| zABMKIR!X`MuiLC zL6IF3y;z`WwJ>(?zyf}h-AiECzKFYyU|{w}zcDc6AC=vcmsQUYR(hz1C>^Pk96-Ur z?5Irl`#sj9*sr7y16w!n2i}P|lmUPBrnFgMI~25wVqE?RLj5YUfsxM{7BZWEUh^~s z`X%>abEx{_u2SxHvm-t<)uz?f^%=B>>&5IsG zQx08BjU6C%;%-_wiM@r>33SvObFCpj-h?vfQZl0K z0W-)^c8zd|Hk>;}H9uhyL7w#*Q2|ko#aa*%vVC>+71SNcv0uQgT?(QXoM5hA`y5{b zpQ+86Moo^S0S(qw8_=0PVM{YDjHbVsx~kC!YyTkk2VjYnj}LM*!NBd3><0&ylf?UL 
zt2Ofja`_N^@$mRH?|{lw`?`@f9tERmE;cM*i2kN1jxg7NA2h_=T_`FdyOZgG$yq2uBtgoIvD<|0?d_H~LA2Y&$MH z4ud6<~>S*)$2Z^ktiA@0~e+Qacbz`zZJ#AL%>t-?M3HmpFLu?&@j{7nTVW5Ii;h2)7p z*a*1MhM}ftcsTA<EjMCx|LJgIuCw3jn* zg}21@;q*KOaZaT6X4@3+BcSe_@)H@pVZ*-JL=#7MZWGdnklMH$G21)PKYsSJ8WyQ7 zAKCPp!-x+WTW~Rd&v{1f`UtGC=$0OVY2||6L(bcnIT>GOo1s$;jSql1H>FfS6u~Ox z+bi#%TSttpVkDmLL|9OFE-?f=fP$%HvOwX~;p_AT`0d|;%7E@3p6YB>kVLyMtQ9p^ z`_rkg7@jo?melqwITWu+k<3_*uoQj&^!i&an;7&^Q~agl2XCueRGts|a07O%?`}t# zqC+G;i)|iXcM2}&Tjh(c)1uAcWo5ArP))R;mRuK}&#z9q59fz8z2fZJQ$lLn(BLZb z3;zjaROm-4`?u17Tj@}TT_A)UPBaWJr#GYBkeC9u#DTFitotnZh29@2m?YNxVJ$?Z z@TAWa6g+P5q}p8i%_jtUHfS!*z6<<3UY?d&u=+nGb@-k}ER2}WQ&ySo&zR&$pW15D zAU(>OtSl*i_IM0?Q>XIRzPGLseU%vF9Xe$npPwth) zc;)d5@498tUr{_jGc;0P;pK*Cfmn)-DZLpP!OYb&GjCbAXo#jW(Tp8Q)=$rK zY1lFb+S-0iwtaTy>IDmPeJ+LoDCgJxx7#+gJ z<103x-WbO4QdJoPv;CP|4@kT~GiaQH3(MFzZ$~CRt9)G@QPAGLxwunaOvY{s>#BZf z@HyV{vvcum1ix1%x(LbSu~=gN*`V!^mYIjdp+E52VJT~mqKowwvD;b84UqED1Emq$ zH#pb7pI?LyVh0?u6+yf3A9lY8#bAPzIl2ogn3N~@-gEEGW_;) zq4^)~!3vCN*Re%bsmcd_QjndGYDA+ikO!Qs6M}>&H@b{rQu!P3Z7u6j2ER#dyfKfC z&yLnm$7M$`&eQ%q1t=0NycM{cNcn_7);w=0UQ*$uhyDf_YH)j-o#8{(kqtXL9#VeB zdTbW^pGg3UF+Y8joc+qMf{VRC%qu=}qf3{Cyvc^E3z(H+(x3N}Dq%O3oaS96O|5M) z5cKV_W@@T_sK1X{X$ece%GM|>F(9@grsJScX@9lfO$mbEH6t^-*ykmCpt9b2g?AN1Kr9<)jy43ob@ z;~(xk44?=O%ixc@Bn5{}?$@fnK>bNXW$T<+8q!!?nHU)D7QFK8Nx6I-j)ZFQViT|* zpF8r5U0Le7cB$=Le;Y(IU$-CA3;5Q+`QC{35sQ8aM7eM4Q$=Isewg;6Pq~d!t)q#` zO?9*FRKGD@#}7#1B(@2`z{>d2d_H^5LD>za@7`! 
zR@|ithGJb_HRry71LW%4r{^~Ee=pVrGYNudE&$5h>7xL6CG(H2o+*G?6%_rc>TiH+ zNqJG3m_O&Ar2}B%z^VWIycgPquaWFn@7I3+s8?+HKPqQCUjm+X!_5PFZuUm0f6LS} zl2D`eNCH6scSz?H5j`jQMsMZSoJ|Vyh+y99w(p(YFO76=SOr%wu9g@G*x2H^#E|0Q zeXmOQjG5S~HRZt#m1a@*)<*3@(BWXmp+mVFKC&Op+-ITVn`B-}1@oNs)Lh|bY?$nXy;K|{K z&!tSmNRHu??=cZX;YO zPH2O78^%sc@JJqvJS+@Qcf|`EYvS}T9gY6R0u4e(!y#}f_z3u^k7=4BB&v7rnCWEyU$NKDZlaIm1Mm71* z&z|?{tbdgza1F|Ibs;agO{%(F!)d=!rAU+J3y`&oV>PH(faAo*V;0O@_sV+E^I#-a z?>V2?O|eezrlEx4TC;n^fhrX!%2`%&@DWs{e;ZeF9t`oI?b zthvNTjLSM&N5O&l)kK~9Uo8Oj0+SGF?_}eyY%)94vgk0AI06+ivf!@XYCH8$*93=; zi#o-V7kw%;n=(q1Tf$-{$vn+2z2Qb5O{v{ds_r(5FK4Oq)Rq)g3 zOA)u0>pd*=n8U4+YDi#3uf}@p5~c7QS}~iv(|FyiyGN|2l4Yt~*UK%I3ixMh7Q8TV z&SjqSdiI7o2)c*)=@SGe%M^QT$a4tP6g_{h5ByM6d(uY6)TJ^20hzfeFz{9WPvGjp zY$)#z5LadyzhPH2J{OuIju#;JbEhjx#Px@cBRi!RUw)GWG*o)mXJ(X|pW23O)Duvi zoC|bkIO%HYKDqt6cIzpL?LAfbsrx!%Q7#WFir1&)qj*#5`d?70D*6>9bEcT7B-cgU zHLJ^(`1s>zZotpYp07{sW_NSHeVMwJckxqZ;O5u!-v$C#UQ{oG9IWrxOOYUdPZX4| zF5VJS7y#$G#z>`tHE;oj7oRf~ilqWQv&hMlLHD|4n#`Vdf{Wk8M+FDNMWZaziH7C- zo)Hn4%fD|3?FeLlz%vV6p>;eLaUM_;lwl~QQoGX`h_~!-a{l{f!y^islcR>Y%N#ir zuh*Gty(lA7W$z@k;cZe!mb<^k2FQoc<+EbI@u7t%=1!ZMfQohAhw>(;FDkLV>g$pAxVKw zO7i!MGY;(nc9ajzwWBr}OX#@q%hj5dWKo|OLKNN4O0Cym9=mK?%Q;+?wig8uV>4tV z3jS8{|NZ48i2gIk+A+AH{0wI7?5gpC_$<3>l{3oko#!RGdYZ})AOplA5T;Tr)faJT zyJ`RSENyy%##Rs@JglTAN6mc;PjdWVnr|yIw@!cQSe(raStAAzDbB9pE8E!^L8^u@ zphJ-BFr!~v;V&*EMi4Q3^maeWAf1JLuQNhslfX1{`2gwpb{D1LWU^0283jrGqblzi5 zR@%E}ENVvwprfu!|9|flUbzz|x^!|8V<6adPaiKveFx9t7BEh7DO{3NLMxs}njfM= z;G8!^Mho9)WjgbGziITFhj?1>isapd4&%Ka0Bd~Sd)zU_IBdeq0bZ<<4;ZDPy=}&z zo^(l!Gx-LQrE}C3ub?s~@+*>0?8+QyPQLRW?_YJU3|ATkOdxl5509rJSO$%W%cTMK zbmr4PlsxE7Ki7$g^CJNx<&=tjZq zCFHx1dgr4jxWr9h=CA$Xqx@tiVmvj)kkc-8Ax+HY2gI$%lU~3r>oe%uXQRt3NMf9n zW*Wg=51FR0&$rUVqUo=P_BLE0452|XP_d4p+qWt#gFRr*6C0HZ5)m|`W@kh>ht=*b zhm~4Kap!H@pL!~m5s3NSJV^+Cn5JYFvoX9lRUBS3w#rA^|9fvFA=8(FoBS0Q7C*ou zO!$b*!Q36xU}vff!xH~eA|nQ~77y;+80+GrRp_bS0qWCEyY$6Eo9EC$iy8j*|NZ3~ zf`-?#o_~I83k(#A=$v4ELd}dZU#J_G-m{YwwiyZ6R=0>~gjtnF(~b=2cJcv%FK|(w 
z2fnR`#;_-!rfE2Dmyk*m2o?aPDD+K0FCt1|%Y`_-13y?ixsPpb=><^dCf@!4b&3=o zZ>0yYcR#Y}If}aRX|>wk5nnfq&aJ7fWgl7Cm^yo1ax_}MplvI;m;7-G&kj()%s`1o z*XhUW+sAj@zQL|VF#ZWBt1jHSFk&96_IlG?Rc0UX$I3T1_b+8=E0j8#Y=aJEpTnzQ z?smXIdf4#0fX}L6<<{VOh9cpfIU)i{j7$>e3(Z%XHx=xbO8YwemA+<=@8yp zin1^*GKsG}%nrbDT)tKzupnlCWnQ^qg(RJDTSU<#nzsx3hU-S2i;)A0BJduF>sCWZ zlunWY9Aaw-l`B z7d)f;jx0+mzi62NiTVW2g1aQL%|Wqz4e8*^AUX@!FY4nF7yrr5I#n+($QN=hMs&ns z(48}Qe)@g{uj+PzmzC~lS$s?IWi(Q>?CZM;7jdBd9m?Y9dsksB;%q1T?w;0VHk^AG zzL&XT^@o7100dOT%Q)TA=-OAk^l%T-G~wqQYxdAbXXP^W8#6nSm|!m6=goL^-!l%3 zBhxEA-kuyyK@{%cZIC?eALe3;VQ<&U6^E+GXan^0%~1*IlSLqX;rTAydkKa3e2tc6 zyx2sIQWcD_AQnnhr(Wzk`Z*P<)GvL?#rR$84r3kH+85aT3(Y6r3tpXhR(DzL|GvpT-|gRjko80j%Xb^5 z?H*sSY)1w8s^>^PX_C}`q)g73AF}ase6DdVG0xz+iVX|)hm(21qB&^=%a5`N@o~0A zE=?hghk(2N04DQH32WcLcITe|0ydqC1$mBjVBjNW8^0$mPnupUyq_w0(4X!~4@M~B zuK^AFWcqgIu^iCUd=s3*f{iJNtK!eiN(8z;j`M_O&1ia3QiQuClG!N2!#PX=#jH2k zp~6lf3j{d@N!ncm%#(=250k2i23E?&tOD_3#wl@iIn#3t9Wz>_lVjCG;dPj6+|C>d09Jb{ZO;`n17y) zEFXs~ssjq_Awsf9!l^qPEp06R%r}vX>{)EIb z6vjd3)o1>mY@!d?|L<3WQ2;#@&DwF!C(+aT1Iz?qX1>H%MermpSa-6YkjX?G5KWTD*iV$aJhO50WD3!fO=L`=pNahN-jSX z=Sr41;U(ks;h@9iG|I8CNB@gq{{%DmLr5qN@AJJk{zID|UcTfB|MUrr1DD8rjh^DV zJRTGW8X}Q(=ZL>4u7sqs0!_uUFMTe9VLCM7w@-e2K%D&suYBGYjE!qNfSP$advA(? 
zy>IATfK@K1=fmREI(ttMcVZsB?wn_%o&ur7O4&5sP<_358c!!ss+W`d^3f3ZVx0gs z-{FLj^F#jTp?m#^vJvglMzeKSkrxEPA(K8^k9}g((N0-(xDY0gOBAL6owbxAC90II zNB?YD3A>9;W{+v`Qoc=E-vdrvlaA3C3Hclxz=e=jzKO^7nr;Dw;Y}{ZXC-@0Rh9J| zo)CQjEvRJYEQzt~BPQ8I?jWKtA~h(UJy5BBJ^v}BNkY6xEfVoJsvwwshvP%+t2}=V zsyTDh$6<=Na5Kb0Tfw1EE*Rt3NJMo@&?)EC(UyQr*3h4eB8x@Zx;qM9_sS~EFpgb^ zHEsVpJXX#%q2EiK{;xInCy5Zmu?>{PF0Q0o^F{)Sb_=vb>e)|ax>)Q($2N5rq zlGNIpjM|+b9!L;`el4AamY&kkR4BWTGYB9BjrbSPhST&}n<&9QM;`$v0<-7D5rwtYWE^4&BC;5qU{j7^B zsW9;&ur9|lj>2w$Qt$55lW#rYFcV_?MCHld~SBYsb7~V@7h#T2-Lz@t+DxOBn-+{@si&-efyJVc5aO9~3`1#>;X_ zpY8uvWofzM&K2XUVIDu(EZ(sgWO~t3asP*t6w0^42V-bp4$5N0GS}=$m_+L_E34ny z8@Ar5aJZ-Aw#rPi#Wv@CXg`5Xh{>!@ak=eB*TOvlVAy!2Pz?5D(jeMV`k#qnYfgKL zQQYu{2LObv+3-|uw%gr8EnHVcZ@(QkO+(|ua<&U*?*LEa?6XICw~sScHyhfaFq>eY z;&2CD8!(Akd)K@R@j68Z(@(DYcQxZOBxfDA(I zBc_ns1|1h3sxfPwM@^c&0G~VP%(^s0TC{+6AOofQ6-va35EYQv5Q!j7G8Fa-90m>Ms~*R((ziUhJTQ;e`+`-hQQ zZ!K|9JTJkW-o~0FSmmEC_1}LeBSLvN9Dykac&1!v5Lr#$YM623teRAO^&~|WeDx}b zzXC#9)W3usyK`AZ?puq*nH`u%%IJ~d`d3`%GA7DtL4dcqQHGVYMYgi+Njr+77XP{+5zA$ zqI2#!OrOq8+1%$<0w0V!ASQiJ2WFyv zhoMfiaQ8JJJRq*OV1jf3$SbGW3u%>9VR9+_Vj)?g9>}L5H{E`GgZ-BjqtckSH!wIi zrp|**eEQXkOPLpBeT>)H9z$d80EFlJgAvk!`z~2>bvR=f7-6@&4rAzgp)U7m-ht|+ z#petJ8NG8D<&_PzI|^piK!UvvB`>L^H&CZQHlKU*oGCW){~#rf>@s<4cQB)LgW1nk zQum$`5HUwmahVN<1g*qI{b&U7*$W^ZqUv^aF#4YykNi7db}fLd5k4GJTjkhnHGv)j z(i5pCo0{O#1DV0kf!AiLR{~b75VQ{#6{N&+dJl&Ax~mIHt0K86HuBpobM0B@MuO zmQJb`c7eQPw26D&-4_|L-83UJ5DAWfQ5;9C0R(k^;9GVox#Sk*5^OB z>Q`Ggo`B}w53qaX=SlEVSsoyUmo`C!af#kD>sE(NX=RS#s&Ph zRdJltMj5Oyt%(hdZ7Bndylru*t#N*dMv(r^$ad z-^9xJ*=b9WB6a)o(BCqR6_1_ zuzQHaI@g%rMSTc2)13HAf z*-hte>YbtI_4*7CI~~z6{>GiG@zvFH;F?^V1W?9MuI;WUJ^0HC_u}xE%INzAAg)v3 z00Wk$t=OBuNI}@iu(2}K&@Spd>$g5dF*WFo^KrZ|?(GWWc7u}M8ev2%;zRnPdG_))vv7nfQGOVztYf~(79bBJu zdFI5?CPgQ=&s{;61_DFj@HMMxoz-%E|mp z^45UQ5IR8r)#^udCKq>U7gx&k4OT-F43~FCdCP4fTd;?*v&PV}!LF?Gnp2{*e4suK zOQ8W!9^`NnMdE;gQf;J|A-hi4Zm&7&>_8ZO2rP~~r8u*~Jh#FzrTP*@=UPu{QrLWz z^~`8aiosPb8(Bw^4^i}F+MV2^ML8dUd6Q5nj8!23qlzm%sgOTsHi4D6ZmB#a*PZY% 
z>c%^1`=ps3!|q(pEC`i@%vu3JU?u;C}aV zH&Y~iyUtZmmfi{41%ZaC0b!9E(LWD!OakNdNDk~SHO#CN?-)4#^N^WBpnaFBuhPOs z5C4@c{;672SZfA^`{%TO6vpy<*#j&x2}I$iib(L5Fy*jQI^7Sg4zu%yg?sBmXtQlf zbNuiEVn_rHYKtL{(L3tRboxv+``K2t%+alNl^;&M_nR5qA10j0?PGh&49nk#lAkN? z^E`fY2K@>~cJwX|pm&rZ2gk22X%h1<`AB`s5Jy|G=fSm(DR%Uwd0ilzh z_B6>Z+NMEsA*cM4&fZupaQH}``?ev88R?NL{AUOz&|a04_5+89PoW_IFZ`EDEHJ1W zBA{KLV)!*zNI~^&qbgENMa!;Uwg4v*H`Asoeq6u0Be?1{P&S?%oSe*ndvN&!4ktYn zcLrMbtplb>&z>``Pm%QdSps23s3-G6gUg9dbps&|hhDcd^BB{l8iDH!sR3tsH zlROu=*#6cjVjO~Q?|g+%%Xxc*LjE-Ka5Ep_TC<$-J=rnBsfvOM93()rnpC*Cn<~@Xh1<=bu zeG8y8u860)vF7%^-q*x4^+-9>{C(x1##8*(&jZtp#WjZn%}mi(C0nrSvajW3ji#l zg(>P?7c@sQ_(*<%GAhCDXuO2$A9U3C^uv#|N0CSlgqyHp^zy=n5omGiL9Gg_(sgPf zNhzP7`~U_LlYtVe`F9LVT*aO_*I?(=*{O;P>!Q-QeQ7eg`G18fEd|^wc^7ihzclRn zYq7Z9Q%{8V2O&)B2@1#7f-tuc4$Y&DdsnIdgCS(f2U8xye8idz?{XEptbtYf4;a%-7rrmHtot2v9PJ__kFcXPj5CHZ+h7k`$tj*9? z3(PwL4av+Suv@lny$=P1{oo<=6Zlxstg^g`9!4?4;0$U)e+IWg*F`nvCyehWABV}- zMo!t@GIp_hR$w)5r>kR=iw2-%1mw66Q+$(;k^01m5IuutEK}`a$P&k#ITe^v5B{`G zR`@$w$zQ;=Fcg6m9YED5#m1pK#piEoW&5OsQe@0QbMBOwRf+a|B`MbL1WyK!ngx}5 zUrSwu@_fgd*b5>6)Ho-3d(*7&J{7nqcObx~2WDdaT;1gmj{GD2L3 zJhi}b$;}0xO9LhKW@|3DUiZ!kbbIuwBd%_*DBL}nIEWTqg0bF2vylx^vHGk}k_sd= z^g@ktzpj$r%gXD{a$GF6C%DB*s^unJg9jt)%FHNoXWB3_{aI;(LaT? 
zRx^tK`JT>Ru37L)aPcE&9Sff=e|PJl7-^X$3P&qp(t7q5qSyy~zdBiEUJZlLsLhmg z-RQsnoCT+%wwrb$fGl$Ti3_-dixuvEdVoZnu`?JG1Hme}ZX!=NJ%&~On{p*E0I*`1 zeWEG3#%e*hFgw63EdIo*epCM9~B8Da`ELhszC6gU=7hMFs$1FESBLyzQ2C%qBDRtXqo zyXd>TG6U|ExRRGLMr*yL4Cgoq2*I#);^EJ!=_bi5E@K2P3rHe)pT)#5N^5JLTKc|9<@3p{?Mg658QtDB|`2)ByF;$~rRD(D-IeZ?!%3S<^}NthE;f z(@T4X;9@x#jE`L{jS(B5Fe_ z7EmyI3Z2sm<-i(3#J?8@DCb)|e6wSu-3XhVhOAhNF$rl!k@+^6NdE1Y3o$(Gx!n9a zz0jl6eSIZ?)xY7yj zyE(1r<^|KXAt-LY%NDZk2EN=L%vQI_I?7)ovs&LV@Bv0hu?$YleFj>R-I<&nJAGYO z4}l)8nn262!?_6!abAMHV%Zcdc;S>M*ZQi(F-usma2s$U*08hg2jgMf5f7vn4(FMO ztJ&jn=8Pxm`IGxNf$VcCOJtHv9tTr``(qkf`D$gubc}=Z&x8sHCN$;ThT`nF+Kn}& zij~{I>oI-iFGS^=jw@_qPU;^56n!zSwLwIftu22->5^5PADIrZ0>4$>BFynt-duX` zQ5zyruTBzE4zpM7p=XfnKb)+@fT+PSGM=olK!2RG=~~l<{p_8HAHk^F2yv11J5_)= zmJJF|0Kc25>*w#UtOtggHwPj#M)^rMhAso27Dupfl`@9M3$WQ;yj~X@VfVUM?50*P zu6%hxx3eIzcKbH0zacK#S_O-PNs@>jfI4N=^7qsD?^USNrRx!CSy+0Zwz@I|y<@DT zWB3fp4LlAeLGf?Z`-V*O&>PDVN$O>o{#Zx~%>*>NL(ppU-KD=F zf!jasX7wG)cC2oX7$L9tcFO_`rjO}%1r$+?-AoF%fHWTP);>aHCcy5?lh($v*kCF= z?2u4crQ&vfV|o{0wcRmW3=b)iB;v<`D+HSF*RP-YhB1Pzunc_%*N6BKrg1N&CPLh> z9s!~pR)3oNF_-wi;oLL*=pm!0lieEUwe?w=-}L8;BDNox#`_pQ3&&K3cv>`_V~$jhTd_ zbu>7QCkPu^Hkx0DPdWHasx0g zmJh_2W?t^cYp@`}C?r!OxLEqdY6m9(**_Q&i2vVToKa|-Z$_@jp4)?tm8t%;mEq;vktwzUDY%?4~rca!vH`0vun4Y``F}EuZ@~jy@9#-fA{gG`xNrBkZZ5DhZa6*Y-n zseIgr=jFt(HgH5mwYFum`t4$E-hDCyV(X4iu??%B|F*?gcdq{eZq`}8AU)mTOrX@W zsb1o=FAR)GW$AAhH(|9-4llHpmSGF`$FVe+ba^n-@y0d+S#$FF?y6t5yxl_{Y)ejH z-Gn|uX5&{7=XB?^qLNk?B3R|J!@LAf_Dnd>9=5A70q#N3t(G=-JXKxJnu2`FbFvFm zb3&oV5q$gzok5)3BC&Dd*niK)re~DnIo;2z7V7DS@eC;ou%NkXl4NdTJoJU2D>tq& za!;smv_q3fz}%SDxAQ zKEdhqvuzDVf2Kc)2Pnan-PCNuj|?+pF4A>+MvA$y{4KpYnc}XrDHi?-4(ws<-(cZj zfG=T^WGmKVPQBmYbXq`{%_9r-M5D{9%t{BmuN~;+4Feu3H8T~rA8>6>L0wQX<7-h9 z)^cK$YQeJ$3&4$L#yXdR5YsWRuhWX6zgm+Njz!xs`H) zI|rT9{^85&Zx1DivF-#5$-++50f9zEC)+G^1_g{58jOVmFl=8QpUU+TP8(>uE*7Z} zckF6>!K#CcXFJ91F;mf$yU0?~p~u40b4Mk12HSLfGq?@qohON~jvVIqxf z>RQ;~RzmTHbdm%bn)#*-vKXU2;h@K%9VWJK@+|rrPjv>1Slii>|M$?7gy1m;>ry^l zQc8b<=cF?Jr|?H(mAuYO>3Sq(-+Dm 
zn{!vK-+g_&$J~iExcPj<4;L%fM~pAo#O0Pva{e1B@*za6wSo5*F?=8i;u?y}=&`km zF%3)%VXlL6VBGBBnM);fs{Izr7&&$JT^e(i#elx_g?B$YK0(PYV0{#=f1V{C*)*|J zyKFNgp%f$~q+_GUgP`ikncg|_=mMMV;|~v)x#`}FT(7&icr{X2gE1*$FvCnak;}G0 zDv-76h4WhmFBWR2IMT1=#k`_zF8ZD{zARE@B0~}^xg3TuyCmv!iKgdMpy|)W2 zuYKYwiis~~?0eeed+F@C=l7q{lT_lmPapzfH-^Ywe<|uO$F{sz6R%c{*Yn!pWD!3I zR*?dJtvUG3IX$_t;t%J8?)jv{EK!{cp_nK z#ZTZLlm1Lc6U>>Umm~rTBO8Q&jx?r^3EfFS}VcKvd;q? zG=^u1PBV4lPg95vWpKZ{6j0j27f$VJPNR8#nZiZHQ4MpblE5?F5@4_ z_@9o6I|}XYHQh=A5p_pZ*PcS|PyfvR&sk#6lRQ%CXv|plgaD%*Pe?ax?ch@Nk$(7P zMGqSi-LztUOBRhqL~INJLlJ)HT~+>;F@G!mGQIr)5RZ^&lYDyPoR~FJ@TqU1E@*|v z+Y{}WnMP_fPa|&$UM)2iCeK7uZ5F*r?%bLpv1JZiq8FV6Z0yG%D^k|5tDEVjHGg$p z9+}J`x~6K?(UF`w*3+vE<~rD+<`+~y8S|neGireQzDQ;nc1(&#CJChyfGkjkdIb*6 zk5k>b<#sOgLg7C4(Hg-K=StZ5WtK^F=`jXKB^lW?CBjE<@_OL43@R8)Z{`K8WIz=f zEH9px6)0_poi!A$5HA%gQyw_DT!hng(?q}JCs4Mg?{Vg~A_ngQI;+NdtU@`j^&bH% zU}ra%2bsU8A&8yE^2(&a)kQ<<97P; zM}f`Mjuk*g#4^#ebNwk19YmbiSH$GCNrm&uXF5!#@H~!R|u~ zBW1jn*6-H-?_Wj*vjsnOYxs(ox@1KK4G8ss+A$lycBEKrYo>G6tK6p>BB^NW01tV+ z3d%1q{st>UHB2d+&4490btx*%hmmU>^jtjFt|HRADnkvr6X0wngc;Qs#cay2ndWb& zn79TLyyzZc&#ws~f)pU?tei2N^XN-QrvGycQ3}Qp3pp9yo#}9w0dqYb?ch!netc1a zd0DWxy@?I%+YXh5II7vEJ8BV!99RG=0^B5=%QZn9h_8APT*29F! 
zS7CQEPlu|5_+1L3o*}W)jqawZkptyeSqkbn(M+Z`YG+r{Yud3?3Cr``-4B~uxK!Hv z1s&uUtF9hFnkMOF+$^Qn7rDjYaYdt^mVUPAZPfjhCUHSxV+L1H5WXhBYF@N?K&CSu>i)Qo}8-t@ulo4iRHQG=0eqEi24Ygd<)-udpC%Ww-0gcOdV z@{ooz{HGjaoZh=`{%oZK7{D-*8)A2D27vj|vOZa*o6`yztQhid=FmR5)uMXVC5Nr= z;PE9}4_H2{VlXN=d#}nH5NWM=rR6h`JAU5GOE1Cse%OFGGEvqh*n%Qtx;Fko2v*I^ z_@qNo{OPoErY$C?7f_!2CdbEGnPu&wTU`4Fqh<1P`iL3-Z@c$AAACKdK@77c!|Xbd zjEDQsb6wf{_5o0&&L+o#j}q7Iy}I3Np;xxQAYI%sln>Gf`dpb?u z0qu$FF`Hjb4Ck{1%56 z)|1)^3u{VmUSHUsv<507QGowBw)~Wv72&#xvSWhw589SnlV%;e?_eC=$}Rhu?6Ec< z{haswqNVJC6>EB&GaJ3t?l4#}c=RgGubm+hIswuaLQ6bRuZgQ67XMcFfyh6Av zP4ks~_oo{T_cbKB#$KICJAL+!aVI&8jlvx6z=peS=zw6e&W=C6>31NiUTk?7kZz$7 zArRu8xATLgDB1Ky;*DeQ}pO(B^p&AYC2Y*h4W9NpX?VCCsbEd+>N>0_)_G(CMp4#t#dcx&~nm|ZG`nI6fNg|V&hsue1z=7?S@vVACR z$mT|rW*gUf&qq&dAl6Rm6Hh`oDGF;K(#}y zIQwNWqlkjqr1iK;0n^v-R)UK$Lxum&$MK%8cqv-B$Ks_ahjKBGmF5|o3A6Z;9-h+Y zn2k3jPc-+JzCFTqdbXMU^f65HyFQye)7T5nkmtX6aocFBTO^VKZJPNfNrdh)+3-Q5 z`U6wvtBOL?b`8RBDb#NWH{IQd9qNu7j3y}C3vf$$R#f+n*1>Ss>f@)639sngG5Bt8 z*7>h2=xB*HD%8m3r#o`W5}%PwmNp{a$!(LmpEbZ4;9coT*qiW?k7M?%USo#RruY&j z=FJ`d70z%?3<5!yBhj~|nt7cMRY)(Ofe+#4ffP^6T5~4jndbpBs4sxh^^>y?Wo+{X zH8&5;lZf2$=W*i-dBJA>*X2h|GW9k-4fR)AxH6 z^82;CSoLh9HJ=sL{ z<>dCCRz;ucG}q7PWXN%j1256u!os9&On6qv<-mo3C(o;ot6)V%_AKok>8zetac4L_ zG*HAmSpELgMd|E<`;5WeI5NuIcpYxV_ruN7r4*mWxUPbiT#!uw>BNtHnvyqO9k5(4-j>bNd62I=G{7jTDCxA&@CXZ^O2XEms#;USXQ{O|vZoxWKZ@wwH(eZ=H z7z@^E00RwKC9jBzSa7x3cJF#xX__*RG`YyRFquX$6T!zU%L^uxu6GSLk|q7#TH=wl z`1m44G|DA=q5E_Lh4KL4<+*&}xRK60ORsJ1g6CCLc}|m9587tQdY%C4g)@G&VBHSe zx;w5`on@3pPtUbT<8Du0+w|men^wTWLF<^O>UsI3Rx3I(VB1 zQP)W%4arnNE8?{}=v22~yibp*?Sg|ltxF#H@WTIDoEc;A`jgR#2Q4m{byYsj7xnnh z8G)GpRS@p&zWHf} zs2b)7t9x5D9Yo@!zUE}Esv}>3!Ik|ZpuSN1daRT|qk4Xu<^!zcFRoSym%`z6AU8kk zU?%>5T%83_mh0MuDUt3FK^mkL5Tqp)0VzR1LK>vIrCSuFkp_`QLK>9rl5XjiR6;@j z`*X)}|1-{>J!d?){l52oo;%jM7E23J7qd5;T=R$^H}0&KU2W`!9YnSwpxB~8^sF9D z_P;*!4}zub0?P7fah1IIHC3|eOBG#eLGz{kVFm0OFMNezt#F!qk9+$~jAHp_q7)>N zOuiizppbH8wgyFoSg&KVjtjUOm(H!G*n}OY-IRXR}$n&els1 
zk@K>p>;daLFR*0}wZSaZvT#m60C!*8_F*s%MT>eBs|fqj;{kruL%++^U7=P>u;MQj zEoGHp9*~M6#3&Hg1iKT9WB2!yJzFJ%V!sfiWTF+2wW3oX=l-01a^S{6hyPYCacNSt z){aeO5>y(xZP6f~VOo11HmNI+O(KaqiDMEAf}=YgU(Knfs}!>+sGNo*_%A*CPM{_# zw;peXSF=x2g~5aT1M~@I=}FRAP4e%1;~lz+mIafBu}pC^gQjG88n|kDU#t4C=9+H+ zr>>Gu-}ElW)gN9}W(@AIf;)f39_6!~FmC4j3eI8LwYcEC^GL9o@Y0^%PdrVqdK!<+zY_&p;F~ zv?;9T*w0scXr|QJe+;TzlOtl2zD^NAKGO>(G%%rY)h>bxMi#YGRBSBu~OthxakIezYH70%vbu^H#^(m1v1)fLC4l zLLE$HW3$iFsv=!t(Hr-Yzo7neS-otT3g#EX?qz=weH8KmdV$ttkiZ!(n8EDsAaJp$ zCFQ>8D;2r2%QY-5beuwwxji#oJGWMwt2^G~To0CVu%5}#2Hps!VL#82$Fd6xUUhY%t zLDN`!hOJnqcQs2`>k7AX*hG4SKhBxs1a;fG%XGniQ89I-69l96Tfs5$s^bPj>^v6RLb#+S`?jtKmFC?Z=i zDJ>(T2$|88Q~~#vVDN8d*~@eSK^V`+#oEDxWlXOU=y7^b5Ev5qf;cxHyE>q z{MS3Ep}odCONtTkIlb0QLdRp(OUcL>D>Dh2y)On!ebp3<&&9*gh7_BIn`Gv|;^ta< z%MBKIaz#4rGH{j<1Qx?nnkk5LZpYX5j-v%pnHkJ?MT5nrLZve7rLsXv`pBeic-qFV)D9OQ2MH0;*B#%QkhJ%*Ugere(j?oPRyQh=0Pw4oiiuf6}GI3bb?IKmPo7!;?R| z`cN6|qkupyeu%kJIn8}TSyo9m*z8-esvS*Lm9cK}>>aoTBjT{6svvvZPrun+-L%4! z*mZu0{DvOYG*$};*XEd0#)NRERMA`t5I%UG42+;!837JS+V}n=7wy|#WG{|}$i%+b z4cdNdubmx;^X$^;586qnhkM0^qmQY??F|LZa9NET=!RP9T2$^`Gr>RV*(nBkiiojP zE-~jhXetEX#L9OG*tt>~@_hp4)Xp=+rtuZMf4XxT4rI!kt&feD!ah!htsvzB9-1q* zZycTQeG{CM0Gp?5R@z7`+0&eR&O=nYJL!=GbOvO zm(hZ-I;5blZvEupPqMZFy6JTrKCU{BA6!hiwd=&f)hYYHH<)9C3wNHJ@2A`8A4cZ_SXz$0zS^~@5%2yk^~=8~Aa zYx({hY-BYDrsw;Nb~Kg}tXzyY2TS?PPC8UGMUsA1Y2>_@xq0^-qx5E4`>O}6xav+? 
zt~7#5G^BHNTmR7*{odzEX24o@!?|?g7x=q^#FqDvd>jSx+wO`o%Kf?7Bk#uyd~*uGnD28K}B~>n_oqn!5HV5qvOLRR+@+7`PEq@)W>qx{fc*AP3No#84vcQ z$<4xbXcFEZ@bW1T@2utYDDE`#55k6eYQ2Irs0V?U!6x>b5|@Z0c-TV9df`Am>c?X` z?=oY}JHF;`q|D|`m-yc1D#ER5bfLoUT02!%^*QU3dR!WXt??#BR)q};xe(RcaLcZ_ zi7Ncr1?lVhH^!OQ;&cOvtw?nUkHgB|l!qZ~EusnuVTJ~xUvtfvABFYb1jY`@L}blk zb=7^?+=(_5TOIK9?6R#COzG}IoG5p_=v}qk=LgXawNf;tT0P#Bsy*$b`5Phl{B9s@ z#%`1mAMBuh+Y8dO_Eb6*xtb5-9ZhjdBmz2eANb%)ztu-_&5-Wa2q{IBbbpRlQ;saCND zkkpo0FC?t%SokbCyu>Q>Lq*qNc&uCDp*KGdOYejnq}h@EZzNB0*=)57{tvhh!=w@9T0@CJj-CbDH9KzG9@bvw3n#E9tM8^lt#`pL#L?4%S9a_hAKQveu*RH(= z6J$w$yvgwNxz-A(Lj)O;+Ek&GVQlaf9RrfFXQIqJkO!=)%5g&WF>1O$f(poW(jWu` zdZXhspFgzUKS%Izk4Q+O{ANVzkMF$1DiZ8Lm|LLSop~RJDCayiGY`ah^aO7EVW&3Y zbn4ou-C3>DpgfSH z1k18XPGw=RbQ2+MAcjgriqQaE+^7sx^Oto@uZ4qEZ9zK#=?MrBAT+ZZT!7L#eBOlR z?^m;$jajo0b{E7iWpWHEz__MAozMvvfz&~F@NE~VsHbp`OA(lU?;Q5_p@ zqWfrKd%^Mpj6HM~pKb7FzAu+oNYS=3CtDE>#z#x~UUI5`$P@Um=o-S`D;xFZnagnqa zQ7_m*JBIg!<-G6(izG$@%eke5gi+7@ZFO*Y>T~@#{q=L+`-k;y)#y)=PGSEuKz2#` z?u{?DhRmc~;ZL_~!Q_`I$9O}4=+M|%uA0eZ^DT`6AEHmr|LB$qgGY(te|E{g`x6og zeRyp^%j_OWEowC-f1@!L_sIub-3E{3dmu>qqlZEJ{W8PJhI=6;tASZ5SiB>*I77|Z z%@KZD6-_HVW<0EWL1$Jc&T45IHPO2Xy5j^rgG@8)?N}zc0BmMu^PrW;R@S`e@S0gC z+8Z-AvWKc$kMru(3`Q8Y1*=o za1U#89vJ59gzb7Sj-*Br*796`OaA7O_pE)V7H@z#iNIqQK6ooH#`}GMc>#_c>x5jL zkiw#Jx%f-ivD5D6Gj}D{A!mld#6s4`gFG@cVE$ehYySuVvILB)m%{jmM9Z&lz)($$N_wMy6=|8>g38Ek+;TEi>KE*TzST$f9X{Yb@JtV zzHsN+5?5JemXCZJq~3j-5sdLryhx$to}DX+nsSm%h6Uz6W%znrKmSbMCC~N)PWc?S zrsC<{{BMuNkcSL@LtkM<1W(SwMmDcvT_J zIH83?X;q4>auw^U7&Zx*dYYX&Yx|lUa-pX!G9^(<{@dB`=T`y-ET9&5LUB=gD^bAS z@CwSnT2|fy810|P#{SbF{B=^>qx_1vuI?T_bH<`}`L@O{d6fkh-K?;j_{VG-Qd1-H zo2ti;tP~5eEl1>Esf=>v)I2oe+;tFsxBfNva4H@p)4%6YYp-sPEW4q=*X#Bu%SiR0 z(geQcpt+In)-Ui<@OXUP0D|+)XA%77NP^E3_bJA{O5MgVCoo2{x~@Y5+bJ=aDnyxq zKON4VXRQY6m~rm4U#<{aD_RBDQ2G39JBnp**R7{tQ}R&Cqq*3D<(5ldBKPiMie^-L z3)D3Bps)aTnqpsuG$B}jAp#cjSZ!AnckZmo;MhTq9YviX+!9 z)h5pM1V(z27y9k>Wnc_wBd*95z%Z$ueXl2>_@N@QP-w+zQFy<=6YvM|lVvdMO3;-0 
z)IMjIcX`j!<|m1YyUr!MpXX;+z5>{fHm3>z0hVwnS`5Fr4<(@M^waC39vusC2)pe$Elh0V?XBD5*_BgI`^J^6Ba1iqAOA{V0oC|khtlU_A`m~Tu=H(Nc zb9O5bIOdGrp8^}wn+g5&s7=A_3LUW%$K|YX<9tR>>X4wbm`1Dll!zd4vJILyE5<0< z(!Gud{Cr!_BphalE$CskYqF|bu5?B;<~|j}Is4HW0u=Hf@|rL|IE;62;jp8{M)tmH zhnyVk2+$LO4Idv6fm+J(BbLa2y-l)o`%s!5keTt8ng(4|{Prj)qxk{)b>Y1O45zAg zz)L-oeGmUPkLUM1<&V;GoVW%YLX#5PGu7{|Jq3BzeyS66W#BdbyTIb?6K0Ncm8Wn+mNaJ)N?dpA{MZ(u z1E;Fy&iDqx{L2+7VBn*^whd422x$afYQ>ow^$xKWJwGbg)*R0!v~&z?b$1Y>X^962zuLgQQL6Je zYhJKFk1z*U1pp_aGT(Rs+MZ!uzJO;;q?QN5tvlj!96E2O8)Y<_8Xf5RgftMZTZnlk9qri@Sw9jtW#>o9&;$UGW@*@5+1FWG3?c|MO-1{z~{>A#o?! zvvuOVR;QxRK>koyDhmlIm6l5xqh^0pmws4G`-51;jWs=>MX<&8hMNvIw^4LnI%ZBy}lZG_=rLj$}en>P}baJT}C8G%~Cpg@V-+~IrjUO7bx|ARI*PZz!k6Wl&+n59vb-~gC$di3P8=bk9yrRq`i zb^cjCk&pGK1|NQm-1R21n;+_waZ1$z9by9~y3t-MGr~~zM}Ll?)@hEeZCx4H$X}Co zvFGzk+nmE6-lLgE$YIbuMXprUE{jj zJ~c0`pr2Bd;EDnJvFF8hNha~`kQNK*5d7psdDCN|y~C<%6;>{+=_4rhT=N3&>A=z1 zYM_Ul1LP8&V5of)LM8#6o$W+z0;6G_uV|FQE=B!14z^+2al`XFzC<3|F2$GqX8p1JkUjOMXC1>N;U!KHQ#JmKm z?5Y+pucz3-t!{RHfLp;{Kqq!&-47-%9!x|sorr>>@RwUN8*re8JdB5*i1>j0NKg_f z2;>suTmzx{d9yOg`p+j0fERsT-xtkrU{J`F{XB;6UL)_q#k}w(HphegOi@^`L3Sb$ z=`m#QX@agNG_kQaH*Z`H%7jFNTd_+$ zbUACMtdAnkp6A9$)?LWu;GZ+CDJU4vnL^OC*E8M}Ci!>*WM6R~v|0W81?Uv5Lk5%^ z(XW^beAiLh!?C<-c;^|j05>U{y-pUZnjrCk}%= z_=^o-!ZZ^~ljL7v{yYPKFO-7Y3j9L%*+CY_jm@m{b~u1vKTe|Ge8#TgCT>>pCA?V< z^F@-nm%2SSEgxycUuXpw?q)WQLcr}3+r&SXl7H`P<>#0LZPfAsa34*h$FvbklaMw* zs%)j(&ehj{^eW<+Xa(6x$nt|OSBmDy3xc1&y-`nIMPwFcsbunM$}39)&OfXq#K_L| zoN9m>3q)Q15AGS%(asraSD7b`og=1pBxV62NvNwML0y9hs%^OX9zWus7&qAcjoxe=;Q9Y_{LaQ=GSCFZ4W=K7acm}QN;3USn zL&djVM`n4}8Dhz2cO}oRyo!1UXaWR ztdm0UT+Fo&P9TkfA8es}O-a|m!q$`gfa2f#9E-`9Z6DeM9~dDA`%hkJ)8K8?zopNx8t(oUcQU@IwOyl*yy-F z;H`P#4qd*r`30*Lfh_>-{&3k4`fFUHPz{veLHc13V)Ycr8~!Te0%i*(f{;}oy68Zp zas)y?5fDYeyXo%A_JfX$qfH+{t+i5)RZa;KAejXOTW`3WY#*0CnllvAFQ^2Ey%p=0 zyljtm=aT&pIHp}ST`@)-bjRR3<6q8}E93`{>qKa7x%~6?BPv!4S>Hh>GK0{0h1sD#3Y77};yIulr>V*4 z?u932(!Oon*a&HDK=H1CWNYh+vPji4u6H`VR^N?^@vKf9x~({niQhSRIWKr%r9M7p 
zb~G7?R$M`@?o{4by-PS%D|q9=!EDq0+hBloA2^FlesurL1A#H`;M8vWHxkNUH7v_= zT1LqY?Y(K45b{x_n9{wN)dx;pFVf|ZL=^;DEF#oiko{8OD7AVeoKCqDPY27*Oz~gM z1+WNh+q6PPs6Xdzjhhp6^%Y^xYdti>h#eIz1Dt~;XV5%NKF8g}A>&yr?YVRrD~8vQ zk8G6P<(2D{3Hgq(o8BVb4S9^KzL3z(WW=sh15am<>D}UH>UiCb4x~HoRgX!GzZlf@ zaT;0Ul)l9ftDRfvtYfaPzN7G?Xcgxfi%tAnS3NVC3=)*t=lj9Pw3!!eKG*r@$hJuQ0NLvszA zq2W>{si<*R6bU7&P)GUtC2}3#3Ku85@{^eFA-I&mVJHV}p+-#x9$I452jYVcXa-l# zwv+~SlFUBkEf^tFCPH1}gnpZ>$Y4o+33e4FeIH0C$Qqq5@md^zhTYc2P^U&oUX<{J z`&ig({jw79CvXfR@|~cUGVVy|2aatZiqJ@t^NnRXO935}9SS0AoIEs3tfkvaxkqp` zDDFjA5(>uvb;>}>Uqvn)ijqW54)L-+%^{)}mLf-3vH4BbkX#95O{96wAU6n&R za_FnlT?aISct(H@H>xfSBmWo-je4NOnVBw#+|S+N&Gd>it&oZAMEJ|b5qm`kfdsYF zGM8`@guw{v(2B~~j9*q=t36aoX35QG;xcoBX~kI&_h+P~9cHkE4a`HQV@Z-;P*hoG z;p|j$XqNiwlI_1G>2}xxa1FRjC|JyGF}E#Ff?C_lNguu4Ny*6VJCfTVO|@saNhChV z<2f?!km?!Lf9JTbocxBTVnf@MP&V!zz_~8Bq^Evl!T*uHp+hke9bAU7#H^``TeFe z=R@duEFZeeyRYkiS!4D&PEra20&ZdQ*6t8Ln-*F@I8uSyIZ8R3g)qY72jshW&nckz zK10*<*kqc0XUO!tfcS$f!qt28ZW09=$-jjm7Un)*PaTD=$!_6Cta>%}WM*-CnrYyT zDslzwoaT{p>9|3pQFeOjs?j;r5c-udLuiO@qoLqyEC^BYuw%fN%qOb7n!;qY0Hurc z`&N$r%v#0sTmIC>-8w<;FikHn+8`qN3W%ZZF>4*SwmmH}XH4Da$q_m^OPdYpMrM*a zp&TQ9WVxI=Y6!p?Kevi0=Bo=77pl;a*_J?KTl3M1_Y8qTD2-$ z?n!p%{pkKabR@-6@Q!-{hPWnSj#yEknbjwkWY8JwM0enx*i3qjbbs9Ew-vm8w*1K` zxKxHyv2rm7geEhF>yiC8VcXss&qb>NJf;d>%+L-6m$5}abF+2h)+cd)&w8w)5%i0%;D2B+4KBo zP_{#N+FE|Q_u(T=3>;dFX$8z-n)7vO3lSYTHBu?Lb{cQJ7M2A%E=hX96b?^hzo{k_ z^YI#z$YB>l5_%C=htX3WA1-kASgGmIsN&ft_`VAGLY+nuYIL7TL1U-;=_qOEA=L?i zJL=8y=tv1kkqq`DGA7DIyNdlAdM{=bOi4KKK~OSXoE$RVC!UG3wMI3R_h*M?L4d55 zd1|;M-*ZgC;3ejtvMhS zv7|oInGA&4Q@};$S5=C^tU}`u-#!*iZlxP99V#l6GHAcQ3(HCK*kYY&gR40%k&~Nn zy{L-Aqc3b~@)?~=ZGYG-P_Nqh_+O<;niV&%;nv&mD}CaT(=U-{3A_Nu$R;U~7lm}m z0(c)C<62gxP3OTp?YPPzbS=6H^Sm=xt-bGk(km4w`EFaKcE0H_4g+2^!Ck%73z4cN z$zW-}@5`RvX!J^bI_6nNJMG8T$5$syorV%o-EF133WaI*)%*oxEXk|KOC0zmj>KH@ z0`1TSQWy#@_fkzrJ6(yK@UzA_peN$J7iOhV-#C*OhM0%s#EQ0ay{lYr&aZg(*a3gK z58w^5etE4dOJ?t=Ok8t6`=YeLCL^UfSe~o4O}8hWQ=2(n4u{E45ck3LwMZgFyAKf< 
zQ_OB69{_<6CS!MO>v*^wpV-gJt-Jyep0a)J<634@Sg%yfW&0?I@*6TLoJBOYrkhW5wiHh6kDXJwe5kfHm}PZAdCY3xzHgFYH0&t!?J|{RYBub2V%A>M z2uT~-3n$A4x1gpY8N>ZW*gs4Q6{ir<`SDpiZZ3S)KPW^wv@@+S`UQc}Fm>d4)=JyC zNN>{$q}q<+MIOhbJWiWIJAlyDra150t;T8KT?q?Qxr}Mv^`DuMGBM^t5_?1UWdj%q z^4W82Rs8v))MA{cJE+H2Q|F2gva@~&@_(5;1JZzrXR_Fi-IiW| zKu9PIe=ZRf@2zhXzb18b!JG60BDkpAx>u@do@nM?a^h!|0t{Z%X%kmoJgm!VwUaz` z+BUUJA50vUU)*MXnH4JS3+DJcl2?CP55K|WG9|V&@^NYlCA2TLxbq`gdrU);wfR4|Ui+Igvhm;kNP>hMaFoIXBc;P0??We7J%8oBTE?vj$DStM0VnbI!?E5Bh#} zF+>WO9c5a%@S<9D*y^xspiw^xdytT(8J>h(ToJil*MrCX_?tO8poIDvcBnx92PulYf>R)lw$er021!pTKa3H72i~L%S zyyqGx*tU;zJsUc`@^%Em`S0URyqz!!x^NZl+H^e`vHup}4_D7eIRWnqsog=7l{h8;*-& z9Fr*aheMw>_EO;;tZmHFmyU;@ghypcXGWN!7v1f6)_vV0#oiQsaoEoOAi?0?SIChn z!yBY@lWpr`_!%TK*{8!!N@9EsKl@4b-52@yOWMnHOwl<$^`+mr^R*3=qe?rM5(kU; zRctp!WLB7#Ox=@2AOWLVhWldsFF6PlXfCce?H=U!y_B5w90%bN#SMQkZn?&t)-p{c zjAhs}TEHv@>kHKkRyc^42>8P zs@Qdlsd!PZ)`41K*-AslUZy3RxYDc zhIR0s>Nr_=U*%R>Nf zipwYLr_y$DoQbPMn1;He;E2DZoIbY;yVgmeTBR=r5~k<}ELbY)wddRQV&txxSxLzD zwxO;DAynNWM?i37L{|#Y+1;r2&0lqK$?ikD-8FXB#!mfuS>p2I#;eeFyxRhAuiF3K zFK}u0w^Op!Cm}z$xnHv_opq8?;7_?1idhk=y@2re6yQg^O!YoNGL!81@@iq-_xuhB6v}1s)x#{)_8dJi*a&%X3%Ix5mx4TmC}Yyt6szaqfPao zE2j35u8uvGxWKQIhEKI3pA49iDc+lZLSYYWOAC~q=j2p3mJ|xCV8U93pJUJZA_nw4biG8q|PB_fY}T?Bksp3cfuDYHtUPBWbT}}tNE#) z)S%TRnooV5=FLt$QC02Mjrx6N_vN8SzX? 
zWUz(rS^hNHd(}(i=;GOKH)I7Lw&?=;B=_CjS^qv9H@uNEw@SBi z*Jtd@2Jkb#1{Oavvm~UElOQIA+G&ul!3!YyGATaIRJtxGI!JF0VrFV0%QM5Cf{aFuQj9hkF14@ zUgi)i+~)S?bGKaeH}z?k%YTyl?-UI&{#Um&CD2%=zHOt^;a3xP>yaAKwR6|)@%|6N z2OzicE{6c43XwVT@yDnO%4_Gd;ZMtNoHHzWo@=DmWz7D51dzLhl?KRw|w-yx$j+mHK0c>nEhRkZ#aiopot81pmrS+MGC8PPPwbkxJ{> zt~Uy&_2-njiMs7Cz}gIHU13*kGi2vA!AqovbCQLUgv2N7FdbxhuP?J=mW)O-#-g3O z-FBV-8Nig_Qf#g@m+R9qrx$*%nOt*oTZ^svP2ltBjdl_F16Ww}Qz9>`vMHpjv!3-j zZ-Efml-wN#dYtbiKTsP`MtbU*i>~&61nRI-GCwExzNCj#;`h=!J>C{2R#vwRQ`61m z0xzLFy1T);#7I3g?n`tG^=bsaU|j@%DhpSQR|V@M<^titz|)i3ej^h@IXgn^XVlYI z-SB9D2grXZY5mPZ3_FT~^C)Xi^p~XfGDQsWT3}s4v!e{6sbmy_NaT zt?r8vtLWPlR}~GdCj0LEylE*9A-Of<-S|(bF^4R?68lA~wGTI|NIOhUI5$5CEuXQ| z1(|fTvAiyM8lU*M+Qym~lTaVOm;LCmd|pTFN5ohbbWzgZjGsBfa;URk8hkZjf#^CrlLUm74_Ft8o-(#hW!#*7Guof~;9PHQ1t-0S( zOo){oMqx$ojwg1i&tdrqzqdrg!m^7#TTXG(59G{Ksa3K5d9>N zHk7OhoG}qaLtjis5vx|marKV*tegv|UNUjx%fb)~=8I(jxR`yeXEU-TE~Gd00F4%r z(yK(vwDq1R9bb>X%#3_z%}p_^_I|i^c@yCS67}eCPg7NwNog@Q1dJ=g-3a~1rupgj zUmhQh%wW56UvX+|*@q^;djXi59Px2l6$!_z&SCs*RqJ=;CLFSBt8q zEwt*|^o03kb;#A&yY`Cqltm3@#NFUloEwF*UAAGT52RV z!b{^7UMLIiPL?C@tRV+IRwuhCZ7YcV-#Ys)$|Pq(8Na$Zfgfzp>f-YJcb=YQh{4Rl z`gu{yRIBWo3Ya8{xbt8wf%Ez0dUxE?hx{;wWOlQhoO&c|)-Q$*wu3~=qtx*l zre~qf-(v5OiSJLe-V?(6nVms=fQJClwMF;WY{=P<-z{?SZfIc%%0c%Ge{@qBA(hX> zPg%p#BQ57({kl6R+_+{J&#Wnl8G?aERTa`@#Udc}%HM`eyjF#Ao}=W`w1;^=j;2{* zaqDQ!|4x3y?$?D_+z)w-R?T(iW?BB-IseFq$t-~W`Bymszd^GuI%EO_OjQ;`v#)tf zu&_@+=^!acggXH@A>~=LG86KQI)TJ`V$^i z9o7;axg)Up8pFHAb;yds}pE1rufMb!sP0RbD*bS|$7W_^g*Za->c*p#Gm4 z$esPMIp#o_b&KZA={g#+(5f`0oy_D)>u8$wr`@@FVwJYke&?=Wme2uM7jrQUQdRfL zvbq>^hi!tHtGGTa(1k&7Rz#TV<=NLe)>j`q&R zSYB!pjZ3Fy=xxS$(`qRzTnW=e3uY+APTmhsZ#xlPV@)aFw;X7_&M23sfYMNyajBi= z&QjY#&#;rn{(BBSmdJn3O>uY347Ud?lH)~0jSC6Y#aGp4z|{57P*Ws}znz?Y2`%)h z1@V~$UN1JksEny~j7eRT-#4`Q+9mdQFU~{O{&dHE8mu*~eJ-;m3Cg@H)x&VttZ0vJR=iXH`X2uy66_126;nEoQfripWI0WFe9lM2ewJqB zDJG`M4@#cb(fv`n(xS8z>OVlbglZPtKYxp|c_)KZBUkQEU(F~7%CFMzMSMBi8_n4x zzux&=Y@dAgF>6oPM4qHGb{f*NH$v!)RqSCCpFVX*{8G|0J`|=3PYrv{N7DV~u`Oik 
zDwE}({5u3K4mRFuNNZDU8~YqUgZCH+clC+;(tGi}sh_4?I+(YtszN#440^MXE^L{t zAEZ<-Vj-|xpRBtcqLA8A<0bAc%m^5>D8f{qre&`Gkx*#OjmZIylB%z&SH|>J+m8Zm z?lEhw&P5t?H-hI&I0cgBtkQ12DkArm5GiArp3JKtnQCbyWG*h~?T8Th)9E94>J5U*wjP+C;Gem-xBcQ@) zT|3Ss9bz1ddL$X%EHt1yez)5|OPSUmy-IX;YVwM1`8B@KYG63731(Aht+xP&jJ^}s z{A1S2GU?!D9}b5Ray10E)zzfdT_sh?_I0~48I~qK{*(<^pg_4UNM%a6v}|HHxh=_b z`n}BMk^r&&E1_Y^@JKxEYm^re+R=BVn1aTYBxVf+$ggv7lNwJDK7RMfzdpBywY-zL z_!Fwe`ky0Y=?pUBrsp7@(xfjnY7?d3rVR3sdCw=y^%AKE31r1rK0G@sl__IXHEKFz zE>r>|R3l?!ND=xzXG&UUYECr=e%48Qx^NC+V=1o>wEsuB*UE#eh;G6Sp0fk znkvUPXDoFWS~z=Z7hiiPv6Dln<^Qo-(R~+VaYC^@biYJ8MW=$*J7)X@2%%jxh1{_c%%#kUx7kM6`H7_m3&iB=5iAfiMr|7SHZ4+Jz|$4 z@J7IUW@Dk1bIJ+3C9J&FQ+VJ)I30Cc@d##2YQAyiG`p z$`F!>P_;OpH3h#9lfFi9tb`?rcZ;E%)9sq+A;K~9#?Y)osM+9k`j!>Pq z$jfgYUAb2gkB0OItxlWY%cyGbSj0u#|#ib$l% z_dg?O%5NWJw22#_6M8wx|6R>QM~NqJ>7M%733GoiWa?H)N>!ha&)0uS9!hze@1lqq z^>Dmcd&KG+Y`qYYr}Kmq7dP9Oy_-}eQKr{x4Wr(_GY8A$)!Q~y&4E`-8L{<$o^C&! ziOTlyN8_R3c_kY(1B4v6imdRrx%XG1AMyA3en$f1WBeA7!10%#F^f@kP*9J@QpgM>@uBaqy*Uglo}3nB6QT2%zs4^zQ5_P*6wv{K{ifYU}|c zMlHy)&mEM~DAraK0N6DS72WU?DD`h&@5GIi!MpUbU86+jesRmb?$f`@ze7&sAc~cl zd+&l^61yUdGHQW{5}7T>#C`X;vY`HN)j0tHprcXME9?S{@Y}iRdc{_sgx$(}RQ&p( zT-O!k;ncbflq$k(VY)8|Acc|RS~2IH`#0E0mJJ0K)!Pu(z7va%N0m4~x=>ZS5eS0KvpE4DZYKQb6V;qRnxt0jeLW!x@l{yP+05#5xB&uX~jkOubV?x zQm-oyVxg|FZ=gDQoU>BIBwpsv{NT0pPWR;E-H0m^JbaV{hI`DJ3iQ4RVKPL)fY7{O z{6!n1j>^(QdhX8s#a33;0|3>6J3-(ec*@`m*!S~bKl4fvOzMC6b1e6Fp(DOUq?{Ip|3 z@u1kWK+QJa0aq*b{WpW=$vYdo9&I`w<&k6H=hQw`?r-Y;M>FtG<8X+M>`-#Kw$94< z8%zKK3Krl=?xpNiC{R~!&Hz7wUPLpHi)XTAuq{x3f}#(Ax$o&dE2P~b0O0^v<=q$y zx=%55KeSVYoF|_MK~7N?=TV<7UN$273N*^A-^2N>CdDN%-|<+N49XGztq!L~!Qz!o zGs{f+eO&Mp0kfJaE!D;sGUa1RW_16yitb||99bi|O1b5bVTo!}|=P%@&Z# z#CCtOXT~@YAG`6gg!M7$2NOOj?}xt{0WB4{=is2=at41nMFVs)m7ea!@XHiO#lHYl zyffW8H(lSgV)FIYQ#}tto*-=Ust2c~lb=~3x8@KEKU-WLS{g7#P?*OJKiFrXKU+eW z6aujE=;3d>nVaAJzfB3^loUa+o_>!>j|*;icC#a}%Z`=kQlxCLS2t*6l^ z?O{C1K*UifR$s}P@-XX$eg}~%Q{m+YMw=HDMom?%0CRPO|6=0D%K55H^i(eemN0C= 
zwzoJpY$8X1L318+_W+t$xxJMG#?l9&->VV~&ft2$WoNhr!Az0;<&h?k09hSgJ#mChh(N(^w&=%O7!G}I|@=>%5?H*yCa;AU(D&4P=hLg}rdO}%Fl*@0& zo%pnWZ);Xa;^aH}LX;~^3SG`t#CImkmf=Y^j*I4%(g%b1_(*_s zjIq@7DY`M7t|Jp$_%M6*m?IS&{D79AB9t~nLQmWfros*QF17C;B1i6A^##mxxwv1&Y|U39S>6)ObJz)=vraA+^hpcvql6FVexJRHhA z8P@mj-z_@43AE{~I{LZNL83P%UN^D}n*H|cx@7F+wZbd!gx9#kBJw>)9#)NMsZvBl;QmV1O2Z9E|93J*A#ZL6)en{Y@|t%srA9Dwo$uOU!rGR z>&0zD7cb2nyT9+LfB$@GT3Z6zS|rTP#>yz`J3GR=QyV-IC}6aRt9)Z_-0)@%F%X8s zyy&3Ld3(o%sHZ51Sc)fZvbIcSAD}#1FaGuZ`pq)I$2YtB?$hroa2r#(EyHPZ_~3gZ z2r%GvC*3Rg`d7)yjk)0DPuW<_E_b&N`$ogf2A``HAtL$bP{B=YS^}~mj187T(r()H zy@+6PC?y{k>@pg1Q2za75r-WI(fO=ntyHuG`Cn86Oxb`HPI4}`l}c_xKY**J7i zm`UG0hurIB8D{C3!BpXit2fsrgRr!47rBq4y~5$-EPH*+q=4xh8v3WNyZ?AV|N1}= zulT_LDJJH*La2)_B2m^;!^4%%3O^bZ1$;n2pSwt`td44wi)JnBZ|P`(W^t2q1yq1X zx9a{i9(KvVW!NQidiC+|bND%;mTc7}XHnJ%a~;G0*qz4T`s;HHBX9#gduyB?tE2~H zfHMt?3{Jv&<_2g%As<$L>K$AcM{rRo&QE;=%`JG|x_B)|CI9{Ii5SI$=nZjLKJVb);) z>IIqs7)g6kB)Apm(B4!eL_hTRJ$d~Mx~4RI>DPq|PjD<@ozdAgJ5kOrEdaS&ofh=Q z^C0Mbs}+FApOrVh;Y|Mk(v^0mq@O6hz7^JjXVMcZ2ss(k@SA{q_hqK~-tAAi|9tcR z$WXf|(N6ETFO=UU3&f7(?&20OeHa#S!tC9&@omiJ!$- z!PKTZAJq=$|6F5&O3-Tv4(c!FKqAo+dL8&z5%7kYPQ`p= z;D&dct8CHbynWXV|A!S6Kd2_LPY7>lbuBap*w=lB5xs+fq@chGkOFQ&LL~S)qLNNQ zuI~5CFY z{THtw)1&3#JhZY)?a$B3@6r7rVv^IO9L$`R*E8pAJ%lm-GE>B700K>NG+rHEm}2hx ztcU1w&&5lh5qXQiJzW?U4;_>cBvV>0JDb~TB~^dYyW7?9RBsNa1xduy5VfKD1fqr% zOLZu94_=D`x#~P&$RDt)gkw=9<<#@WPOSfQ$NwyLS??{tMvVDt1EkZttid$1ek1ES zD3fgBqDN#F-j66&9Fsi8ry0D2ejc22vz|{3?&}>Bo?g8~{cTL}p}GDqBfLM)uyyi0m@Ty6vqJ zNw$!^GO|i$xXnb_gd+aex93-ne((GKkE7!`?x!O6_x^mYabD+nUNOrA^Rb0$4Xt|` zMHXi=A1!ZCTxtoL%_K`Ov-_OmL1y#ugXFuDd}>`J^lK=bi{|qdy=x0b7a55ZM>~2p zI;WVMgv-8}c-bM3TE38+JDTsQ?bTlgD8Qb>dAY$aZD~Vo9q7aBX&9w<76!hgWqv{n zZVse0Si}d|62G53}s}7&E5DQf@+_q<~sC%6uZ&bztK$zAr!VJ%T+Rr|3H+^Pt4Of zFC?A2^94X#ThiwSO`Ji4)I6H4uy$>8>9ofD6jB6!i(4}DOd&Bi!LR$S;w?(nAp|9Z z<$1dJRjKJ4qFg3X>#IQRgdH8EyY63h5YeM?knSqH;4iA4Z|R|QhG^PL|=6eLUnK{ zjyN^R;|#p#h+EgecUMw2oF88B$kQ>!B+s8RK=z%_@uflg5{OGY?q;$V$8m=AfyAe{ 
z>jo{>0pot+q0gXV1KSO)Dnhf@{AcXPgoDebnQ(?wwn>HiXCzN79E0Vz09}-2Ufy69 z-%fX_#*xK}>jsCB6L%azh&G&cT2QXjxs$P(ArdIC@~?fqyJo!nn&yJoV|nmE3C|o) zuvS!i@Fz$7^P>NfKhE)K{#1MSVz-z=U(`@iD>hW<=6k|Dli~yJ)|2%=`HE@-@iqDd zu2B8iU(g|owUXuuoVn4n%3rqkVUX!{>$!w$*PLrJ*$?xwhof9vGS8IbOR1)k+@YWo ziYRae8{N3e<)rH^^K<{6fm^b& ztgWiHy-cCcBvowQT(o(l=2C_CSVE9!A9G70T1e#S)Wzs&G^IU!^cQLv%p>nm+1vGL z2dPUaMtWX8U&$beROs~O;KwP1A3?imQ19~{vD+y>7RS);4&({?z*%&XnuO1tkgW4m z=A^Ts$GS)AjrT%%-YgCAdMy*aFwx<9atQ7ZA2}(6&l3yneFqUG_<@~z|89bno@bOw zQG}_CsRBRUqWo%(D(zTMgymxvC8eq^PRfusLVHa`>D7r9A@Y{q(WUue{-o6wW%t!G zpp6KwDW#iHT@pQadx3o1B82Ola)2ST>tzKeQCvf>LA_Kk$zLG6&BEOrxzf4xw!(HW zTun!6+Ow2~bptjdzbP-bxdHPe`VnVyjVH9LlQ>So2BpbLYpUIjvph{*Xv5po zRS_OUY*c83rH2leA;qAQ`qW`uG}&PgYsBy1`Ohd`J_(~CT1l_zL0sLXaIEdCfHGOf z=W&;XXoAflQesUrqxVOTo7&+qo$?v((C_YjSUE{i^!Ps&CVXqJM;K)9Kt4Ca7A5Tp z$jhgaXP*K-MU(dC8#X$Mjm{urfmHf#eV%C?_BJ=k%D@`9vOqYFLKd-CoaQQGNZcqG z;gv~#WX&%5)F>hMGfFT)Rq&-oVq5-#;@sF z3j@ur^yMY09ZwCw>NC_Fjy0L+A!=}TlIoaHRu51(a5vo_sZ3lqe__6+SweMx(p{Q9 zw|y)_20!g1Tcm7y36;bd3$B1uBz;~Jw7Qs+D~(PWIGfs;H-u9yf9~yYt;%K=Dg0n( zoB|7Cn9tJje5NYtdqnz1%YhUVKyjQ_PhC=B{mZ~65_OOC6L zNw4|;Jj-I436>-CW#T@&Ye^n0$_IIQMs+_x6@9$UPvtz}LBI?Xht3w;csx=P^IU;y zj%TMs)Z|FH7+}tF24q-kPODHUl|i}EDg1_{Cuk#b;KQR)>R~1WhfuyoyAU;U29MCQ zp*B3f>$6b#0~UYmbGc&Qt?Ke#Y4#6~-W^rVFcU!~yV!4omx0U1Ttbkb*zr18h0~;S zJ`gjFF+lXt=i93kdCsx4MB0Zlt6Jmp4b#|MPiVkxFXy_3OuT~Lbm6-*n1U57IeO)& zl#L<(><#MK9Ct}0vR3tlORtszK&k#X(INAwfpArWprpI zG_88QQl#|F*Lwg};sBT%|d2ZARk0TVos= zz42Pcw{hC69RaprnRAmvHH69WSF}tc&soTt_#BhFNqBI+ydf-3jGqr4buc~*k9aA1 zx&SkeyW@@Q?S-rPkGQCgU##}vz0@ianwM*fPk7+65$SbSL>cv<_$dm_IFiYt8Td1H zOI&q41B;kKF-&Oc^sBla*wYK9=&@$2m%BU*ulKMS?20})UXzs_y+%TGg8$^Ef2soI zKn(M)*e!leufS-AX5JzY7{F_L1NSLEDH2 z57!^0QoU}K6Km}cc6u{_OnJJ4rP+j5m7&s&cEuse9)Bcu^c2C$aYn53Pu_wN+88x~ z!e*IWP=3Vcxj5+$>TaA%??+@p0ga$mTWhaSOzE-sT_q!THd-2IEqXq>Q%VSsh(XfB zgLr;DN_>~`ob`l?_I}5MlUKj(0lWJ)9hb8;VqNN4zn9iLl&`ajkT4`uD-^_>Lsjm2 z@7+&kl(1=`*#{B&W8=E7#3m)(8u0*el>t<{FIdYNV`J~$Ja4%t1uIz5RU1q?MoXsYr*_Hdw6gy 
z(w_!(r8{*geaTe2-1?fURgPY*FF@u1smwNxMb?|X35VFg z1#ySWIsjpa-b4H+rW9V*>`SWEFV21H7fVTA&^-815?c|pON(VQqH1R4pGfOrTY=?S zX{e9$mH=@ip595jGRyEPnSPJGa1ai**IS<4DiuMdaacGFTXzL!;68HsqT#aTw=&a6k=#8tbkmDwmb`R#!`Yb`jd*VM^+^YORz zJPph7g>%e#FCkBh~Mz{Wz9XGMM>T;S*L&leN%=Us~f%&-fl|d4Gm5u9sN| z+2!Ji-HC1tCcJ-r#YQ3w@Av^5L4~U3+fay5UV;%GTiaTm0x;C zA$&(fB2LYjI{CleZ>YR1%Nfx5t_kwkPJW90WHl(fK7*_l}8;be%P;_NGMkMDc>R&=nVCp~s$0@1!J@UkJ2{x_$r` z$!t3c)$?ra08f3oYPH?I6sCp64H8C6>q|lEy;7^vQhaF>X6}sKJhmngJSW~(&y2=m zvZoP5oqp@N9gIENtEVEZvW0V6*9@gxrQ$v}QquH(ADYuA_2;4e^p}?;7^6Sf>6Imi z=2p0(`DH}BlFs?70i@9cH{W;U5ba!!w+x+-qd-zwss$ZmT(0hu~!K5PtFiRJ%opZuSWgV_wnS+Tk2 zO;>%VELKB_*-ECReVl5=$10au$e?dl^jUvKp9%UYMtK40ShaAn^%b+@|AOjKu^5YU zk^(bnQ7?@Nd!w;+Ts@{G7*}g~7eWNX0n=(dCfnrnC|jP*NoMioiyQfukNKEXY(fvH zBVk#Qvh{YWk4H=oY=P(YPEUrLp;5w{IG#CRdi)9e!0h{y*s4E8Cr#32Yu3OL8P;x7 zG;4Qirg0cV&`1$JL1N=>S!@oTtz8O*!kUGC{E84}LRKfS7(DGvzqxis??6Ne<4A~T&Y_dK&5UT(Ee{$1BZ02-xqy8LlZrS@1ag%>|@W)2o%rfGwbLtrU}x>7hkm(}Ab-b)xEbHs55 zSy?#G2s;+~sJJ}BdOn0;MvX+OKf>G#*NQ4v=9`^T&p8&JcauYS6_p!SZ;l=Q{FwA! zi{(;^EYQP^PGaQlW8;&N&|k6c_^J_st7#bQFe35fY|%w5ab%BYvCF(zEyhyCw}tmD z4{u~y0a;phJeBaSY|?rguJ6^W1GMB^m&5SFPfEaX+pumnq3@N4sj+-GC3p0b&7tx? 
z13~1lF^>|E2WnK16k&1lV0Lhs*fr0ul`GNv%hCEZkx>^)gXYJK)oqQ^jJ;n${6E$A zuf)!X*<^-EgPR$??D+a>2*h~<6PfpL5;$cVz<6iIgM10 zqzYS-380|juNWtT`Lv$PqlG<4+NPf@kGkD0b|dJ5kGRCa!EmqGrCO9h=3|CUYE^@G zHC}0(7Ty{jDOY&e_PTkqbOuOB(&&*r&R~2Lu{oQRFn6jo*c~(0&=%xublsQ z0nJVHfl68JuK3qP@^YEosUR5Jyv?i75o-A=iRKvn7OLS18g9h-8>`r`$)hjh;!nso zGonG>!5(~v1~cB%WV4cLTu%u70{6>&3bg)ijG5K2puj0|Z;}dE3@cIl7 zKprrVx>+QxWWLfy}5(yK`DleK2FVUW{N4rlrxm zz91=FIZ>neDAh}6BkRJ`uL&>r9qqb!WC9$Bd|%Yu`Z~_BAM~^G(ek!udWfLc&k8&{X2yVZB}r;1C9}(b?Ozr?{+)adX;@xal3W7d!3QQTF#GFmpeDGaeVs z55?KGCqM7s!#ym`zb2s=9FSad-wspg^pemywRv1y<+bVr-W$A6_)PEQsHZDCN0v=z zF2F$SK1Ip&{Jj}EQEPzN7Cc!rZg;dtbdZ|{gx0zIME0O6-oe5jVN27x^=Mx3->eGf z^0hjK26ct9%voyb>XGW^&z$W!mGE?#tNkNxN^#2_bQ+t8{WbO3ULhqN>BB!hTdZ^2hTZY9@~Axz3-sC=6k;t^b zRp7s=`Jw)7@4yUb+;+l(d8$jLO+X`0<%<7pLZGAYg*>7l^Y!GlLMKi{BxN-^nKPtN ziAIWl9mu1i#i1N6XHlt+z-)Y+Oh=Z@61pN+wFhP6>^H|v>S*gAroG#ysj=#0QEvGx zQS@)lzU0j4#=|l>20N_W@#Yq-T9dw?Cn%eyDKBF;JEccI{_HsLN=)R~7%qBeOb(57 zK2kvwRoq%LZ70l-dsiYdOOo~uMAtA395+O{voxP)xOM2Ph9P_}TD|!jxrQ;@{U3NM z0z!=0{4R;_vGf?%9(jnHE0!dx_hqm@lN+~S?oK_6xQA}WO{-KWDX}#l#Mg*{EvB+& zRhPVR8J|HZMxSr9oOIV3$0f=UDzmPEi9l&4XOBgkaPJWup}B{TZ9{6l;)uKIywsf* zpm^RLttf;k>jLuRAcu}^Ly7l!F+!4K~@%Kdmc!sT|-$B z^!n@s<+hfLT3NgZvublE!O-=|FD=3vXJSrb(`^({d0{V0UA`0PoOIrkVEY2q_`wqt zZ@w?!$A(-a6e8V?wAm`gW<&M96b^MaOLZybDak~@TJ*_sa3>d~nLHJi{c}^>79D#z zR=Qb~V^IwV-GB$-E)>1RkdnQK}MetTES^;**OmcPmyT*8a zZZCh7x23GD`C2qbPj?h%3&p}2Ilz(H9s-Y%ysIW+LKdb0reDl>9nLF~z*KJwsA8?< z-Tsu>=&v+IRLWkSm7c5_cu`n=d08-h%}{H#2*f-_A+wMG)yD?;Rk5HsuM;U@M=(x# zG|N@!u3r#E`!;HOn{&;;`V=$2w_O9MX^ZRMV@l&m${{T-s4KF)J#pIaeVF_cseh-S zK)Lh}5+eJ)%a(eL^-a|#JyU_suMY=9?il`Kn2D+<9EBi$y%r+GxzV)#(-1fTB3Zni z0JOVh@mA3bb>4?C=m(wyXm-9EFIw?V-6w{kBk1}846J8KMrzEMqD0Lz=zs36&$`UC zf3JcG2R>s`U1uoIvlYqKnnji~5s<#7-*#)|8IpfcOQX_26P11oDSPpu1&9p}A-F`; z)+5D_2eWzH?snN-KJ4xP`k|&(q#pbhdL<|VM)p7j?tJ&_BUtA{%w?sT)y!HyJ46e{ zB8vbv35RWWdg(v|v*cU){)6S+_jp39oRrDN^egDPF{6!hqg@1&h8oe)gkyui$bMx% zS}IDDRM@ful<`bsmYwNK%>7fylB8KnUCv_mri$8YwdfOOyKG9Dm&WKbB{-eL-V4V* 
zZd-z4=1I?Oc!taz1m>$Wp>gtAN`DvV;uKkJe0sFCy{oWu4JqBOTP~rRap&%Zg$i-& z6pjOAt8p_FXf|hJZ#%e|+bi4t@mr3zBDlT$@eVxICRd41&xl@29Uhs(3tu#b&qB(5 z$yw&=Xy52x{cx^_(iDzwzxz7{bEj)BMPsxn z#DfJ4U)>rLt%HT9Ct+BL zcbUcLzN$~y+14tpJk255o_3E`iYw$BCaDTtwdL)nJknn2m6+bI>waWVV)_OrmC9`D z(%q5hpsd)UhJ%mWZP(Xql}Bn!l388=3O%I41^`<)4z*=;xQ3eBQO`<6TC6yMpAhyV z&wb&Rngx&05KWhj;du^?WTLkvSUA$k+%3QEN**@s>--%k2KRv7S+!eJ^P0Q3-4kG_XzmDA z+g2@VUabywyrQr0HzsVcydZCA(_WKoO7SWkwuKN`RVJ0XbTTlSP(QRWB3@t`XsfcY zI@e5`1w93#D$bp*e8)^qMR*%Bsr8RK%}(F>H|!Fvk)cb1j9LanX8LW+;l4Q z&Xk(9E>e%D)o^lAKq`nR5x0^g{WHRek_X_j#8RyZ?l)%^~Z@-L`PN;pq}q|Z@@V&G~i-S3%5v3VyY zF%s>zHUCfu|Ct{76DUx1th|_d|CH*1xJC2yaYg2V7P;2FD_k3`>T4nwI ziTroq?|OT{fNRFab%*kb)t5X}dQU3e$~^2AbA*!%y})?DNIEbh7aEJYD4D^{FUp6h|ci z?OS=d?gZyhddI4OACxawz)_$9I1{1Lk8Lr(-O(dBe}#5o$PN`?O8w>`O=bE%{Q0&p zzm;()&^<Pk=1(=HBf$XK+aC;yY(NRYSW=POKjM%0bWGhl&5h^F}FNzWbbA!j3xU&JH zoNsO2{X}Qz314z5%|)J)s<0dGRmWTP(@$Wj`Uc~+oAe1zf>d!W4f+LAdkD#EkC1+> zW_`|?PtcLwsKr|54UsHFxB7c~oHNP?kPX1BI>BrKKBH?T;`%|PMAY2mGMj6LScK2D z8pFhM=U~XRonb%1sFsqqV^&d$D{o2{9M6ar%`_wrYz-TbWv*4vH zwwk>gatu2!!o;a_lV6nT=)_4{-Tz*i^D~_d(UFv5du9|W_$Cfvsc~Y2!w~fS2m_>v zQ~1x6oIJo22MAEAL)_k#zz>OSP1A$-VuO6=j zN@PVW)GM0D_d#A>{#II0BTeGHKHH@$$E42`dNA4a0uO%fyO$HBU|?us%yqxt9s-ak z``eFara|ev8{VsRpKVI#P#AbcnFO^N7J-Q-nFm2+=eV0H-dlQ>AEOlxe(4s+pG6?g zD*Fi*E}#0wDgZ-O#2>DDGwrp?G6DMHf%Z$?n2yleY4gTpxNSg=NDr;EiVX<)TE}9u z9UTw?b8cCwKGBfkHc1}u!(u=?x$2&J^Qk$xjl#QPKNe7RO95pS{#StnB_r*xk`d9v zRXLl~JV87nnF9`f?#2Erfm@G_N1|BUdLHOg4mwBrTjmy==T9dPy;vHF)%6_Jl0Wf+ zoSbPySd*3nql%+*~vBb zsQ^v9b<3^`o>U`QzL3`uMM6*N*&_YQND>KfmR17bWcF3~yIj48%;Q7eJ~pL2`{W^~ z+;u(aIg|RLw|aS%0`>c)9FQj+Oqg9`uU19;AEC&9AXj-DN-p_{zW*?uZbE@* z_ieqwZ)F3-Cyy_iN6?9-T?Q2G_wh653xJ>X+O-L?YoMzEROxg~xo$=LKRnO?p9_r% z!p5C1ETc6tY7+kDjg4M02HFn-#3xqNf|H-QIiL^iWlrOBi%1516^pRCDMwSl=3P+k zoe`i-W7}}qD-zQrGr%`QJJudy)i*$lIGYa|8AJgI^byw-h>#MrO>K4h_sZ=<8aE*6 zSE!>w0$biABHjw(AP24vh^afGdahnzHFP~8K~9(A&Kc^(#A?qv&d_{`wZ0e&$j`u>Mbo z)Fu|?ZQeyF@n=)WVltXii>?#EC{=x~%W7Fl0oaW%7CeysK1Go~EKfjmI_z*1Y0g*R 
z3F7-#DxyR^=?woZIpxJEbjvJBh494d$lJAAPkQEhjEYuQ|D(WBL>QuqZ*p@a|B4Cn zD5@%OqJ;||yjhw487PUsy8Q+ZR?i|MuFhxr3WJM6U>Ad=tNHCP$+rbuBR3qAPkCmo zwpRHa`bs4e9p@cU3Dj7!2rQ-PxK{7uHFiej!5Zh~g+LwlrpP1?;pBrbmgzDPNQ;nw zK%|hLeRyn>;oNQO1d_xOb)d6fTE-nqphbf_DHw^*Ejvus%W$a8ygB*q3zU?V4u^Y_ zhabBj$hX}gDu9obQ&vj$wF6sm3*sFyLLGQgWfw?8XAoFWpmGx5A;jxnTPxB z+ekEIzC*Zscr3k|?|t{b=?N(-IthPyg4SMFm;3~bEK3zpQ@L`@XUd~-qpY12Hh2G& zb&?wXFV$O5+K&`Bk*b}Ui{rWeykSoCxBR7%4XGw_L64vu!+L4_*-@L_U!2p9GXb#Q z0S0+qpm=|=9581fp*fbxvhA1=Q@0!5>K3s1&>kv$BVpf=`)*215WUyGT(reHexK}}Mw+V5zG{rhOke5g#^m29$ zYxD)Aj=peA<@z6;7XmvfVJ&^!f;oaAyKO4sFB`W1L>3IJ0kw`rU9fU*;qTipE+tK6OFJQv0r! z-)&Z!sFvyc5Ux3ARhwCqj5`=&K`Tix=5~%YhD+8ybCZtYM;-shT!o<4^XT-j&s(aC zy9n)`E}Lg|rxy5LcJtpBmqTZ|4gd$x-xIYkDEZu*Zr9_Pyll}n2g1DG44G703?MPy z3X~bh*Kq>jRfdbe1KKlwQps?Ep{p6TeewzEI_TJwv@?@11;4boQzppmPGRR~kKnF8 z9t^fH<&p)0da-QAJd%AY32-Ihe+xLh%voz{nWGUnBS41O__V7vR$|0|L5=+e|nRU?GucF>RhWy^jrTgp;$sVGHE7pJz^Q-t} z_nFo(sOKvF|Urdl5bEz=6TyJHLzaKROu z)oUlU2biB-K0mYBT9%gb#bnPa^PYIq(&}`D2@lS)>?=OKiA$pKK$^htnU;z2?JM1y$TyNxhVrs9CRA)SD z2p{O@uBg}1>WrHC9Zar(MjFC(6VeCRJf?n0oBPa+kL~R@^q?DmsW|(r(u3urCY{+z zS>P=_^kCwU?9{nm!04}-KtQ$jcIyTdxAtuRgY{sJ!dM(>seb`^%L@7gkx@5S@z09y zQ%3-FTJ>u4#@j~Cd@<_ZnG&d94YO*hJT?EQ7Rb3xLHOQgu%F!`W4;mlST=Y54a(3` zh)nrm(P7tE_|n80J&>-wlFr=8V57`)kWkv&I}Cpt0)na7+zY3H=P`p4qizCGi5U$L z%ETQ5j7c(bzOvsW@D8RbA$P>m&F{`3*+Sbb#lI+~BIuHq5F~o@_4IhF4FUDz@cT87 z$kF$94D~B0x}fO3JW`{tMFI8IL-_sY8nAJg!7h4=yh2I+EjT%*pCU_q0H3s;nsl;% z1gs`+t>J^7P5v##uH2zqjBD9&@SoW5Yuv^`U~qTwnPY|(v<(2abY&lHk;Q0V?lH`OP2U%A zO>U`(;M(~*+~;lL`3m}qsZl5)CazO{03k+$r=^e}Vv`-_?fo!z@*oJbIC-pa3R?11$hbXX*?)5l1Y;lfTBGvZj@b>}@^8Os-P~aQ-29M8Cvg1j3AZu4COdZ8AdCoye5L zU*Ao>pQ5oVWY*bsD@LhAF@{y?tel$S5nupG;zi_*vuXTO8I^p6g<7~=qdrD@pAKJQ zCvFw1Y(eZ72ex>q1;7Kz!UWS)my47@>$QB?#!F-}0Q6X}z7gE)c~9Vy(> zT*Al8cC`clNU=8kpC}c9(N}hfHy&6#;jiUiiu1JCD*gm@q#qb2>mt~8SJWi5knee0 zHUKND)5JsSq+Ofp2TnucZGXqZHM3(DWGCPHTgJj-QY zV>7^G8TY#@CM`8)4q_c-;pkv%oKpW4+aTw>klSxAJfaihH4tbaYu{_ppQ{T=n;%O$ 
zVaJrfV=a$rDn{Fc3N`9aL9q=32F7Tb51>>I+Xsl`<6C+h&o!5%GbR^m7m=;wfy4s4 zfMm?lD#8R1U@tPo&BHsIE;mw++N45f$QJ->F%f67C1XnQ=ug;c$-pfqgOu>?2u@Fr98F@eokXjeytP9VyNEk z{4d~EDtGn87Tge`&&^IBa!3#AGa4j*PBmq6nv zaA%QF%NlaX4Lsq&MJKSyE}j)@VliqBbWo}G(7zRUrDd}Y1eXtw^F0`r>K%{gy+O1T zp#94B#{P!GCj2xT<57SL=HK1`nmx!b6yyUxFgsVPk&@>V&cV`6T(f+xgxL)56`ou5 z(z%1u)~Qm*Bsi4gira!Ys`75uaRHhz^z?i-{(WWL6f0VTnOyZbl4(qMPsH@(bQB^L z<{%V}=hGn@d&+Wi7LFQzkq3?TeoHg&%M5(j-a?Y^A{ zj+e23bIHA+%#9B7(C#Ys%oA~ff>f{-tYhO!kiwncFx6r zySly88+dCO!gmCfGH*d83cE2!^)EoD%CqPUaV?hf5Z?1+x<=^i7}dAb^8qm+p%tD3 zv~mUHYS7--`4)mYqBcN8FJ6Gbaw-B8~w>l+|j(>bfC4pb55=zWB*Y|NcYw$H8we?7FN%dH$yeSrk)w**BG=94rs(F<`Uf)Nl0mQjmdohh?mnd z8XY)JVnAuxFoGEYvg`~1M-(YLbGI3+?5a4;gsJTy)c#ca=eM)}nhR??HhksWxfF?dVR zbbW~0Jj-R>OM7lmaw^aVMw%gPr2Pgu0a&=;&{qO@9bSTG^$xgXxE+2P0#^nk+lzWX zJkt?*20IX+AVM*_Tq>l(Jn~owrh@9o#20AOh44K4q|{WWp|Hvw2g_Wc)ysS)DvLp$ zVPAqOBwAyQ1@^52Ydn)h%9jJkymwYk$vD;`n-Sl*WXhV+O_4@X`R$UA({74q+!@kq zvUt!!Vy(W99W~-sgjeX7vsVM%3mmlO#{`XoOy96WaZ0lxiIvvdD> z^a0q{tqW6L{B_0V*fD*luP;Sq{($DlwC42uHqn^ws3?c~g}f=HgX3dz-Z@JtF?nxz zk611Le{QI}FSCCoU+28b`-+DgPC77c$~)m0pT`G0n^7%Z<~QEe3G&+B%9vFru|LkN z%9^$kd@YEC@L9w@>|1@e*U^kJb2d;Fy`x}Cl0FwVx=2f#o0NgBZ(4 zm=nmdAM7aq{U2a6gVp|)adS-?#)%3zin>hJ+)?s*INp`8MiWLYyarXpH#k{ux%G%q z2;ptXTNAw!WKBZWf`yU9<&duIvagy|iJnf!ymLAOen9Zixcas6{y7|vYv;-c0!V&< z7CHP(ai!-1@d^IWP@&>Bbxy1~n8~>%zP|u}c9Qj68*_Dt!RxMv?F}WhTmu4%ZdmY^ zk!?r>BWq8rpEtx<>Lc2Cb}l&f?4RARNnxyZcV?W_f5(GnDJkUWWttLCPq-3!>cz{#N7A{Q|yL098`};p=A~gkhR! z2(#Zf0{4@{H+4v93uuCR2j+kCo|uFtpASG&gv_QOJUtq2rKgE<*F1=Ve#+<+w$-r{ zPuOOTVP?nu*r|I!5(hvp1Zp4R|Vd zILwBen(a~p%0OvyJ8I_Ov(zq9rKw(k? 
zQFm?Nx&~=qEO#J`+t+-M&40}o&SYJEAog%4P>=o>_^vdCA!yGnX_cq<7gbl@oeE(h zJ;Q#kUJS$dg%X55i!xukq8V-7^|D$D04$*KJAomAd&lKzAeQ#0UTyk?GW{8R{NFD$ z<1ieH5CYrT@8CF6?ss^AWV*vuek=dqUI0J73ij_efaslln5*R{%$!cWyuUva%$f!} z;sL$ox^okfA=^+DoGoa>XTPa%egm!XWnE6shqw2tEV!&0lBRRI*^kL=T{Vd2;fyl# zz;oRUxiVg6bqx2*`PTmWpWFJ0jFB(Dr#;c6c>-H$Jmo|js~iI9Ui8cs^^r>l4(96| zf;#p5#r-ZlT)k~%hZtF;QeCET$j~D)mo83UL>WD$M@5d62F-bF&?MjJ^v!}>4H*L) zV6XuUf$t#APBL;PZuab)ZM{|LH~}DP?pccicx+rSsmLS$fx~tzA6nwH2WR1FLKoOX zUhB)CQIpKPX2hWz)copWgP&Xp2TU9+_!*PEpLvf3ss+@S<%rVE+~=uoVo$X0NVPPd(e zRmc0d7X&@Y)tQ_KtfmM-v#kHxl|%6KAvHNwa}eTuw8L|;k8hc7V?FOie!?%Gr2icg z>oCGnKyP|WpieFd?`zmMJxiPqw>pI7u3@6gEdl9_sZvYY#UZBgN2U`0_kmJimLvBB zD--W;Up^JY)vmIW?ZkOaQC1qK6|9eQxJc0ca4}W)xwb1a>$?2ff(tEugMT_`Qbb^O zx1KBMn_QjB9wff?>s)RAEW1v3Q>_YQ2y3PL_cMO>R;l-m3>ecs{HQwG#_HC z!&!3!@%b`ffi}^gsqp;KG65CS2tBK$^OU@kC3Rdj0o!}r^Nd)lj%2sozYawVV4<_1 zzIke!ojY&@{KLTkvv?11TY&Mp3>NF(qr)T26|bnq?qre9!AtHAHRV89XaHA(Y|2Vy z`AvqgJ=8L5XC4c8v6RcpW zszJCWQM~yuM1MMC4m)L0EN%X;W4!Jw7dfH=l+@8{c|!{13xohi7}AStP(HZUxT*_>u<(>Cu?>^WV{SEZ0Egam z8}a6u1t9y95Z;MUbHwolSJSTgudW=rJO4(IisY3iS+;;P&_i=BWV7olVpN6Yt;2(# zKm~6_th`;*953w`A3{y2_*N4j=-hvIO-J_;uDxoy5yIt*Qtm-wgRIq)Pk#*(j?@zH zZeU)66@$mH>7qNYT?xGytoSgV>xj`J_E0AL2W1H(mzaybsC~FcbMg>oGN9a(XC+0@ z{zyidk2mj{7;h&aT*5gnI6rfKms5^7Nr%c`CRmz;eaweHy}#t;!pJ;D;V!L|yFIJr zYr7B700MyrfNEC}FW9-dd#`t%r-@P>(JB9Pp$%0U*obEP4ul1x z7FbKo-*gMds732Fu$B*?x}UJmYa0UZY>(s+T-n<`TjN+^CyR7Tp8R%GJ921Qql*Xt zCyk^MCG!4wBFbgHT1J?i!H z0a*W75*_L9nHwlPK_L%1MH8KS%ddv(f!U(XCfP-ZxR_iDr!JWQy71ufxZ)paF%Zpd zqTCQJz+3&#-R~%#irSAjgJq-%c76K+H2DCS5=|;6S^}gA{PXWX9jnu(<{K+E3CitO zNkpvJ?j$Mh)4T&DKVK^kRn5KoZvDO6<{^BkSNr(0V1BM*P|`!styN@{F6_V0;(fSc z@jdj*)!(1!??BnFVhs3pTGnXR?Yqn&H0KFmE0FbF?^pz4d_k@Yy*x4|viDdSPkz5G4`ivKW?-Hz%cxYRi%{|H=p)P2^gNVwDlw5 z0SEKm-Rm<#HmIX`p$+*6}=^A)vV|E`^NKmxqfo z(gQ=OTN^#~DF6%&V1){drdELa3yuZ5fU6r_qIb*D z1|JQ0s1~JeV#vNU7{O*iKtWtY4EYC1xKGyj{=5rV2h89m^K}Ht*lL$Y0cPfFYrKYH z#>!L7lIK;w`4@MJJis5L;hGCO(OVkN1|1oNBtrlnGlY*Bpi1L5iUz!2*|9vt_;gF@ 
z?a>qmK#{OlrPCz3u-vHb?rzXc;17d^ltPE+Ejy7j8sQ#RSvP#`E<|r5bLMv!k5Qzm zUHGwI;Ztx{1cyMRxaZq}F5Q18OjJlC*r2FoV?O@oRQD|*^|%n80v)g(pnLlr1eiH7 zO|c)4&|Y}2O79QA8(s=+K>Z7vSrMR( z|GH`a3-H+p?ACT$@|*H&NT7ZC7|bo=vc5~#nvE3bpG9mdIZowWZ0g%%?)TUK{ZqiH zWp<)M1|ASUmg)K=X2C&}?A!u#ejGnmr;>4Rnj{=1?i(+(9^_u3z0K+%JT9FnYEXUK z2Eso#Yo^9fzJ=xK0GL@6paGU0_OV!3`rN< z2LEdJ6)>>&QKxT~TZ5Yj6x)NLCxhU80n|;W@f5B#{S}{-Ig`79Rx|SuAHK3f#2iX` zK!BD(-r;{PPqYpp%qkO32EqLDH7-I5*;<3QbeoQlHM$!IYTYZ;E|!)OaQV-5svi6P z__Q1YwOKS1<8ASR_u3*?d4lW%UpgKwF5rEaMmIv2k_r1)i&_91vZ~K|GMtNO5*3&= zD64>12SBjbh)KvZyE_N~1dsuU&n_ti-xiC5q>lh!VnOUa+$pFd30s~Tv!CO##4!rZ z$rm^H-qOrfz6{@`=uHuOXn4)HP|63ydrq+PX)hNX>s)^)Ii%!>YJ}Qy92%u>Ux#a+ zodTo$){|*I8=dNG=W$S(7=jL#k>5eD6}NBKokKgbc5m>8&g))~5UkAAo<_ZLMf~S4azX?=2_h1;pbZ6k2w%|X5bM9{DJDKU0N4Cv;kHcWW z(USOfViJ+m%7v{n3U=`7KflVd zA;$==RDki_g^>XO7R?}QV?PWR1&8E-Rzi4QF50C>nHq{jSo@bh;q%JfIwmpY&s^?* zX!)J-ye7Zc#n^@6qFd2%7T{o17zE4!Ud6Y~=#bwZ5`3s&R23?;3tmY}H;q7RpsrDH zxfFpO$PS;^t^@py_u;ASDD*w*zAs6-6O;)A3<7QuAc6T?c_WTB01aZ=MVkgE+{Ya3v5<{1aN5|u zg_@E`J3<-7$)f@wcfkQ@LS;dzazens`bfW{nJy1cMCR5PXblYkRmiIs=#e`luUabY zS~3kgQJfgog{<;u^I4SvoOa^ZkIK&i`T^&6JlCEmB;3wdKMKkqubHg~qT{E)i`1`fEngIH^rkyfj=)I-BqH^!Q)Cd9x+|eN?j9fD%K+1=ypv^0 zNe2$20v&jbwH4ljH{@v$gN6wIryR}}#l#X^-Um3M2`rC(r9uikNX*!Kb6 zfgZ_O#gK4e1S&BL^I-|Zs8(K^LbudB?CY0qk-pod@JampLn5ygS+EfGNH8JQk`YcU zvH&fQ<_F@!{O*G2jeq}g2PPJ-t1$`b-?sFO0bq*Ou)aD8qwBAZG1g3SyFhG|j0o-~ z&k|sN9RZH{+aNXV{sse^LkK&I@b}DU26_I~Qe_KZ3Rlqx8tsF|%8U>S<~pcFuX~(M z@sjs{7Xy8f(`e}@_~L9555SGUN? 
z#Sf%4xJ+RJ1pGV8V>@tMVy~R(c`#%Qyp6-3tA5y|kyxI1By^%3bmpq_6l=t|SCKIq z0Keef8H(60jliUDt&v}SvZZnhLSunm*@IjS$4jQg-}P<}T)Sdn|50)XjBX5T^8pAT zMc+6$J!4pK_3?!h>c()^_SNTh*f^G6K+$=MDcHCIAshe(a@Kx?T-r{S0tOn&3Fcp| z2z}nw!#~TtP9MSXZnFuWDeZooYYtxqmQx;m{{JqV$h+hpE38}&cP#nVPsjPRJQttK zuY=%vOdNTlAPz^m8=nI#>+k=TlF?6b0KRz8N4|~=KDm!1&s6@{S!LQxh@)kbKR6DW zJ-GsS@KFW*AcmN3i7Rm&{6m>$_97r*l4a1iUE2=^a46}repM)m0my`^a0V2kiBR7F zZL3JKeiuj}IZuY~(-|sYQ+OJ*1NvJ^f(5d%kkoBYuGZJqSKhl*&deo{0m- zgw2L$xNp_(1Cxyc8F=|tVBk0S4M*!xGaFXl<^kGARxQxbN^awU4uR%9!Ad_NIH`^S zPsCZNsw0$w4J&ye4!LL0=O7)(cWVi695@To%Fr!3` z3_v$>5DIm%+y!~i`7idmX7DX1EKmd>etni6_TqoZSDKOXK!k*9$pi8|Vs`1c0X(Q@ zyb=xbyD}?rzbR%&47){TOmFD-?*W0f5t!17hhaPPk<>Q6n{A1#`c^0>sdk62kTSM+ zGXXMWXX>d`Eq6W=6>ef7Fze18V!xjX4@PReMAoqhIXk$5cGD!?C>Kv$ag$<_p0K%8 zCFnS=&6KbnP!I0K9km%9+>5VVd&aa{vJ01>^ zHCA&%;}SmVrE1ao6iKy0goO3^FhAUfQ!~N9inFJ#U1&h7!io=rJ8HMP+=A5$tixk0 zAq=KHp(mUxUmBYo3%x+-NFjKhgLF(fr<)wmPx9kj_95>#z}1#2?2gUHavr&a#VbylR>9>Q(EMirMLqtDY5^o|2w) zu7L0haQxi^vU87WpY73|chR(1KKPtwKOnj{>rI=v*MNs4h@8eDksA4KCADRsNLGMT zGOb^oyCcxqtUm8uxLS=ktupw)n;!urGVm60+1ffG_N;#lrpHsqU~=na}p^})Mb^3&WX#~90J z*C}xI%acnFzYOHLeNOi%4c>}W=Xrn#A&vNK@3z*&SNX!|RT}Fc)skvP0P)FYyv<5H zS!SgHjJdoLd7cJQZ%zwNG0feoxi-lpcoiv_LgJDJo9wg%)?|MjW~h--7<(AmmTL)@ z)uf4fyp)e}T4mXh5V`yA6`&ly^I-=eR(2m0wikfY!ODtH2L7fmV$@T`mjR$TCFkn{ zLUhH!WGWEsn5{`5{6jGR8XJraK#`;vt*HKH%Ky_SG4Bz;Bsm-M8HBuW$zZiLC(@3G zBA#ng0B~?DkTR8j(m(sSAtUoPa6Fa45)-U($NZRQf2cm^y+zYfS$Yv0e{31V5|)Bb zKXt}(Nd+N2IAYF|7)A{3{+oC?BNDuZ>ts*_Ce=;5M-A^ml4RI)RQ&|VlFfy<3*9wB z@t|*^aV^le(2-pGbqA*aXY{H6mF6P|r< zXj`d0>!3)P%sYT|ESX{d$VG__iyw#l1<+ zw@S;33|^yfzRwxt`e26byG(5RWPzcz+o(mNR}5iAAAj8UYgeIcA7+W1hg#$*(9s|* zwOvR%58ZQGT9v%DD5;FjF31f%mOsX)*h_h=WH(}YU^ZW8!>u3Q1d<2zwd-)8y#60! 
z?*WeW|Amj|L6Jn1QOXk$GK!2cdRii~_lOFap)#_cmIh@@Llm;g2n|F_MoNW@L>ZN= z$jbWPZ>v7v-|v52zw7#3u1{Bap7%JfbMAAW`<(N~GdkgMGuWzp?>Ae%DPVH&lGCe( z!3-#Y>hT(gr7syjes!Dw6NgCgc$DH^D~vp6`JAT+Jtk(SOieZS^*;RaJWw~HIB$oz zK{{92>J!ICj5!O^kYK!8HkA;r(J#ul@<;96Yj=V?R1MK(f4tYeE=VN|>Zz^1rxWl0 zqKjRAKup|z!^U(es) zJn4B+NR#?L2J%u(DATUzt7}j-smwvzM5mSDe>$y*IP;uNXi7!j#V@{bP3h;|`hCP4 z(0@=ZW}ER9i~rA*N3uMFC%3BW&Zo;dp^bQbDXaSMV}_85At2=ezfrB)1Pd|*{iT*q?QB z*%6gpG{=Taw4d=ga@zuUIR}dwWt=w@THx>cwm9Fcz%wg&0i*H{j0br0E7scO{Q3#( z1?Re_yQlb=x_3l-Mf8Jk56Rv>&~ZX3zhEQxFgVmU5WsJBaQ8Qik98el*cDWluaS3T zqV@@z3&AY9*cgzvo($k|2vR3gbF(D1`97yI-tkB4#z>#Zeaa+0<($~Bt|lWSojV?? z_SoCJx&9+5un#>0fYNNP>99tvT%K?+ zceu(dx9l&}7Js$Zi`)p3ms~ia=pED1iw>FZzdP>*Qd-8=WYJ*& z{O{jS1vV}4wDZ1KgN7Wkxv%2S!Adlf#K?@tog$rLT3*jY7)QaihSuz=0LL`)^`gYN!{_YW(W};RBgGoC7TJq;5Q#c3kr5#_DpS2cC1qF-*cn=Rs3UqHs&ycBeqF ztgLi9tQjw?Gs6H>05fJko1}V$Wwd_D{XCkAgVk3`kk2L3-83}dGfOYdP0XX66zXB4 zmM%7W86*&A+7-Wi;9mDSRjX&Wl1H_mgtSybFUPvKyFbjwu$liUH6e`~t=i@{4lU9* zw~F3ZE|J^bD*v;kCuBY2^3w3M@|zdGmL5hj&9mXCh_bfZ`#YiQ?Eo^rq#KTgUpBaL z3j>haeQxNgN`||-Uqm17md~AfTc~Qh-@rQMrAvs9c)oB4({-AsBIptPO7DJ^As+e(1fU_(ip=5bll)#P1035w7IePz4kC6W z>}Wj8A!_mDI*a+p&Rvy@b{y}DJI#h9v(o;VWCVdshb$OPJ073)yh2`g_VR0-53_>p z_cYaK_mlj6(^S#*l(q7HVW>NtK)LPSu{;a8b=I!}3Yq??umJ_d5ZLU+Bh1+k@X#1>tAwYX?(C*ON}#vo%y^Da%E%PAB$m@YF?C5mCdsW&l{t2zvB`E9Fx98eT}L$SCS7@h$dU z*XStC``2;Ko>wj!jq-Dw$%dTk6Yec~Q+-n96z18>{oDFe+DNzbbz?y*Oi!`x&~x>CJYhiPuAT>5hfQ}iUSYs=g?W~&j`Djm9i z8hVjw=_Yx-J!n?zzZdfxTgcSY!)CTHdgH_Vvn^X6<~Nd0<>xyr^k7)SbMkphkA}9Wq6F$V@+~T zi9oF|IsCm}) zaluVZO*N~^hle}kIVxDvveyQLMriI86J^;b9!j~fT|5wV!Y~1`>V+Mx(DbD;C#a#Y zs4bOru%5lRT3A{l&Cxcz-+Gm4?)J(=-&8vsq4^VTru4C?UbvNmTxjrZt=1zr=51SvB2%Juo7>0FD*Yb&_Dbg1Os*-vW-#`% z!&xc}H#>dDvp?2~hHUfqGd{W9RI{?NF&}d^d^A@+%b>r!a{&zW%#b;?a{)Dk zH6%F9s`HT}Cx1Ri>FYeq#kSRKw=iLTQOjb@_8ML~@u=qNQb>H-ZtIL*PQSWNdG>2V zmWQ0w9BmzYfx-rmtMs~?Z|r#D^xzN6+ zdpNh?b#ZM{SH530E-L{>0e|#T^@PF5bgo9$ zl(^>4s)|Xm>{S6pcTS23hwm1xFQ=?o?=WG@Qls9M?kG3jRWFrm44q>2&Vj6J*RDmd 
z#5g}UCZ6i_qn&xaA)FzXEv!-)>Wy@Kb3})=K$mPfRFbTEaPHp~@23}%${aEr8Xfp1 zS*BUbB5v`dpi4u1+g%gy%RhKY@gjXcKHLE$tB`O*V#bSk0MN$m$dy;%!8c@Xq9W;9 zvPwo&Ggcu+(mHH~j`d`C3XKoMzfg@Ny$O4;n+l5G^TdfrA?+RA1$tLh5~+C&_ZkAY zX0P(3rX`k#>hz_2p2D|uuFVQw%(KnUtoxDrMD(Bi(gPplz&4>FK>ykeg%di8zh!I1 ze?15ejd(QKz}F)g+90>ehpJ#`erUh@-OajR3NX<0_WJDAyTG0X?PQ|u&J9Vsz$tO$ zORn;L*ulyyx~KvUo$GP?4=UAiyqx+om^M1J-E-*0dG^jGEBfnDhNr~bEYV5VLDG{O zM1@uuxbp5h=lO-v#!((hbie7q7tin183z)ZysG6CXRm-#T|Ht$h zj4l@^0LE!tPaeDj@hj}RHf+uzEFcZ7@4NZ-RW}B9DBglMzRbJLuRKB+W5}hph2;E@ zO4@gGi?x09x-y-0>JjVQKY6~p{VM2p?=Ww+#j(1~g7S_DIfoi>CO#~!ZR?sp`;PQY zc3=A_5v%As{{87h{sZTx0{6lGmld3EV>%~LRGJze)`aLn-mP=pfc3uIb(e_)>eBOR zOH4k$zu(qlLdMxv`co6^=gMT|Op5~2KL|Tq#Hexmj@L2D&er-G#v=ww+q$dbPlc|V zwJ&N5M_*qb#yFh$vwT#TXwS4iTc=Eq`bW}`nNoJ9i@w`nD*DYUuDvKVEA*;4w~rp* zvB|Bxx!}0^4F#t%{_#RhN_bCCkNpI(We#*h=dfV^flv|h&!>URt7@I42DrO(&#l-H zm2P@qlC9`)d#+Y^_Qc8Y^O#C-wfdmbDN=X3dvc}g)UVdSTJp%!Tu$@PJE#ZT(HvUVa+KoRZ45X6FPf4YI8JYh*nP?;}`8Ru6orr8#zil`Ckj&_0NHUQ8KKc z8NH;8&f0$#pRc*sJdTqK}lgh4mXm8Y^|9BIXWd z^5pKL?Z+m>_Qa}$-o9d2CQR40lk63Ag^^8g^5*d=$dM02;b z@IK=+ySuGWGj6}FW1O*jAKR??Q8qHdf0tjk8|=Uk)~K>tycT!*oSTK{$mc$nWy|YY zs=>*tJ5>4hnPv!? 
zD&R30DYJcy&DJ@A@>)Xm>gFFAO9CB;dz*7V_`j2;lqVXI=1&vfQ_Lf_8}x>mKSgaZ z!Kt0|(~U2aa{+~Nj`1{O*7{#$CVH#HI&^Ksq;3+@O0J7r#y)(bAaA6%+bREdnMJ0M z^g@nj?_Z14TeS{;8fD6JZ1$Zj-xv7i^O~*hUZHH(6RUrq2T|vahFIgO5$~vVI0h{( zB%K{C83F2xLYWVsR_=ik2qU|o)}fY+Fmpi&*7MxEO`JL^F+mG#L&lr-?20TYp_x0_ zVNQ{`^4@+)woRo-THdvrk*hkJl4zrtuxZP{iD$hA&tTQFx?X)m+K!CK#6Oj?Rd4LO zbB{O#pXwtf+5%n592se^;H>wol-qm~Ssho_c!y3Lg{HNtzgWnoukb}is=$MO?;1^u zi9~FI_&m!TYIlaya z7qbq&a(Z{~XO!>fV^Q}!(i+ztDHt)Rc7A*^z>VR>IqtNpriFF`hTU6_X8AtwJ0CWF zxJ>cKZJ&#TJTc~amfKH*pWh`5i5H)~o3cSFByx5>*xBm@&nC^xc1)Xda^w?krmvQPMv$Z>kBMw(n& z$+Z27IUA!QIdh`2+iyp`TRO4-mUPw9+@bzQ7YGkQV5e(iP35hS6X|d`Xgt_&;7;+JYU%g!*ew`7~2kKs|C4>X3=g*pL_{ zdQ4_$%%rX|!qtgP9eDRy<>b(ZVSDoaGgXDV#v0dJZ=~tIY>>=aRblr(2Fe;nI<<&p zKNyLj7hwDK zCC`?vmUE#bT}Exi#3sblld+O#JEiuRIrMIDgEdblfzhy^0?=M3{Kf3G^!+619CGPz zQyo;&IYb~`>79;?NTK|D4EB6!ZN0t5glnvcSsD~*M~3fk&yCTA4fW7zrN*Eto)-9Y zWyPOi6uza_tQ{DEmcf~4ZzdKCwQhuMKzAX#ly~q)tkViH4J+Uiv`EqjpT+z@ungmFm(()UHoDU-t_ZR76XS2ioiIzuaD;rK9o$|ye z0ST?!?$1{*dYji%r@N{gTd~@lc`t5q5Cx_QxXVNUBAe5YNNG==50{o27f(1~AAq3_ zl5Q6iNd5b-%6r`FAaBS;^Y1nnrmOZ3z7o%$@CUmn2J(&Q65YF2J~MiQp++TZU}o|C z!*4I~Y~hnQJ>GT<1(KUeO}*|k4F~(4c&H$(c%=oh@TLZGl(4K(&@wGAaB@<9=v6E0 zH#z>OIU(jX$cXCuD1sWWoRYtX^!^Wn@S{yj;U|m0ezsBmkeTWA~D ze>(E=C}nG&De{*{*DZdwAId3prxU|H=7UvXBTxxOZNAM+q;Y>Un}9)v} z-TvaT6!VWy{GA6x`n)#{3TE5oM_ic#Eo~o0Amq&5RD@=ShH!Ffhcdg~GG(ke;AAF9zjQhm*KSWlDN zm2yYPVYOPgV6bZ%E5qIhp?HnjXg;1pkA~5@l3dXViGY07#Jf*cAM1f<7ydp{(rfXA zv;3@OL)Un~=PIuT2c-rTv93Grk?xvI&Ij8o(b%3o)CVS+rb*$2ee~l9|5~eo=K)Li z)R1R0wOJ$SG9vqpk7O4(qP$eSsXWgQH?&&z$n}3MSdXw^?=RR_Q-=nF?qB;%+>LVW zEPBT2a~0q`a8|h`t#90S^nKMsn`1gU7Om~@oN%I+H%Qr>hjHxRC++*$*|`;Ub8tSk z6A{)*fFx>1&rw?F7ITV=Ud)aVHP$IKlc@^XdkWnoe?uliNI;je^sx^0LAI2@;jxs^ zh5$J_Ix8pApfKiu8?sCUHy50^-ERHGU5q~v5_J7=$z1css}w~CktipQYF#8wI`~I# zZ*J>#{@k3<_1a~8#C5P5OjOdEg&v|zpFbvRKldz$@F2*-srm-dGnH|(#p?(cQ>qRU$m%#$=3NukXrjL=WslAnzwQ3XLtO$Y9B3i$J+K|x8tJ`FsqDutZ6cQmE0=&pPyZ_yye^44Lr8t!smZjh~?s-=Y zGDD{iNx@NVbElX!a|@09=(Fb?pW5%JI?)W4YTpXF@i@-3IbKGVV{1@=6Jw5Tv(uaR 
zgXh`S>*tG^UMwB9vOha*;WkwGd$cT6zI?em$%AmeNnTev@4lwR+{?g}`)$h@k|_`* zgZC4z5KH4!@}DfgA`Fa^*PHjlmhr{k-rls9aLaW?x;{4Cn*j^oO(s(5W4SBpYjl;n z9+*)4d?eqjq%LP%mF1ji_d!tX(4>~2z)g7%4X1Dk3#GxTW+T7LeX_mIIxCC32KOrJ zV?Ns8M!JKbW(S3z)X?kfy<7XRHbr-j6MK~$PD{fjb`0BDQngRXXM`*-mKZhM6QkXdalQJ&XFuk>%xo*PKW3|QI$TJbFSn84Vpu9)*|!IcMy>T9!k)S(ERho& z5PpbWb35(B>&f>YKVvyTLStRn<>d!oUApV;hRK{bjDEkrhyH8#YJ}Z;7Pd7WbaGDr z&3#|g^rvy2&Y;$J=F7W^f6*;oE|TXsfm1Lw(EZRy(*VoTUOl%&1hZ5leg6CzAzu~W z5x?V1$)(la6Mx)v3!??rV{!udK8;n6HuP}&r1>x72(;evHaT|1m2JHscf+( zJ>RZ#X!lxrXsa6t&0Ni3Kd(hA>B1>t>r|@=V9^<0l%@W>rayb&N>u31(a`-I1%?9+ z?i`)M{;fC-cI_`9gZcF!@#-fO>yQiA1jTrwKG|814wO6UF*=o}DWVR^>(#GRd4|01 z@P{{rv<1 zUK;5`g3vfkA_mkfBzk7{9+GZ#?rLr2Z!e#WeIJIy7F|DJFR3R(T~5ih|AR-}&!7BQ zrNFT_O5w!O+?&*Mx$%mA<8{uecT*b6YZo_M$^{`4#(REW?t?HN42`LdWk~l*e%atX zh%5tb;WZ6cwjyWBix5gXj83?_HFe2{owgok);_}kNm_s7lN+{R z+Bx3T|uUX(Tw-ET*7w^d#<+)%N(0k2&Fp|vEQ4#ly;qwN>2 z{*s;i{X-We*91$7;s$}Z&iEKl$gL&!eNw00A|M+*v8Kpx!hZ9`W&CH4*U2p@E3W(g zd^jG^xDhk1-qCbBc=Pkd#X&d>x=KreThgi&tS3P*I|{vf?B?Kf@?pWH?42izy6>*% z7S>spD}}_sfF;*RBrazNf+e(Ro~?b|8!!4E{fOo%FSIm`C->`TWS^IVl*Xj&LPT&d ziV!~E;CbcI!93u!<5w5t6o4s?$0=joZ%f(#MNY79E|N~81g`}~TQ!%CEX0Rk&oqD9hq>J&2;*~PY+E=zFiRnqfX|X;dO`N@S_hUljA1&kTBZv#LQG|`gSVMn8 zXPelw~okSbRZ=z9YsHij$NkhJHF|+J&lo~ z7?L*}Fnu2@^HhW!e%3mFr5`Lm|{<51`RMatI#7B1u46=IQIorX9^%;5!Xn5rtqp&H|q zF;R7GR(*=L|CTL&#Uf%3n?SxLUGyoVHg@NnJ1$;KpHpA+{yBR350^)%x^HO0q#*VC zQ8#F8-o_%T{lHW`t`E(=i`by5M2F}#?uc=^M5kTuC?1mN7ys^VG)!`J;4J@4>fafi z^HC`y#JYo1)QB~X6p1=3F}e&L%#klUYVMSpeM64C_rL&RP7T3qCqCR!dD^P0t*x!2 zqobESQDkST^tj@RDk~(12@iJ^b|OIbU){TwnrBSSS(c2MedR%;(C8;es~8IK%Wq3o z{TY;al+t}A;uQfY^S(qf>s!V1oN%@^WnIFxk@9({Qvh%$RQ7D!&29eoIs~FfON?_j zLFcQ(ek(StTi(iTYU6>f(J_V;sTgU8r~>wCYDKbc;$F;cS;fM_GAg$Ji6B%?r66D^k2UDba96Jz+x%ZYn)sPN8h8v$=#{RGAIE7qrdXz zP;Q!Q6hZVch)|(;OgDX?VY3Y7hw?D*@;54evh_J_I~vHW=_H*$U%!6k`*}Q7Gc?k~ z>M*Tw`fiS^`U7)0Hj+z*(yjqS=kjl~_JQr|d5WZ?>}i{gI_%MJQsex&ROL|zY7L3^ zT0j>l>u}_R)4#L6y?eCW_`+f<>*K-;L)Mp9PoaC7tI~&dRM-JYkICJWwaSVSG0l*X 
zM+T|$MeiO)<=h0P6x!-DEWG<`??*p74|JE@Euayr@JL;erHS|3r=})rnY`LfTOtOh z#sz6_EGt6?Jznmcho^jQZIE7vxD|PURKHha@5+R_R%A5*qQj=%WW%z}zA5S(#UFH# zp$F-|TJ930q2$>|X*&A=^tq%}Yr5Y3($}7bhK7UtgnHHM=?`LyuTCzXvsiuaS7d)r z041$A-5X?XQxv3MLR`CGRbWHTB&B)1ig>#A?X#Js7Z!9j?Jx3EoM_%=uM=!N9E4I> z^5AX*oy+k(KEPpNs6?bT3H~>!V%;b~=IupkrmWlyuuImYe9^wJXlAxt0hXYJbaVMG z=$4JG>(}kSgd0j@np&rF#k1|jBaB30Ie&Oy2{Kf5%inE}2Gb%RyY%EZ9~-@75yc(; zs3^*O-v%62vLy*($#DMzZEwM}_+~_$QEQhUyZL!{65m7)Gc(9`DyC*4d-l=u8tB+} zn>PDWigV(2w3P*|qnX3PmK#XfY5`qq$@kNOZ#If`J#q{e5?H*=={hP)h`$jM7LzCu z+8+`=Q2u6kZWe${-9Rs|bjr(jw|1LmTd6{FoLdmHQl|hSO#WEUULB?BB3qqGH^6^tNEGQAd^L zm5tk#_Pp;m@D^IdRH}D2*Wu&U&iJm_J5k&!QYWNJy>n4XG7&3>CsCN1IMitn#%Y?e zDa4^NXEai`Hg9c!qY9sZGMu-e;c7`KN`1yo!RXSCh4$ALIBDP7&y|z)ClTFMTTTvj zd}5vhF4L^J3muh?k*#$28Nk5}8Kk6STRn!a7_U~33*SV!Eb$mPN7 z?=W~8!N}3yO5JLxx_7DH#N;O$YSoI6K?`eb4r6x1?h-^2EA1#OqfGgSIyRuQ>d8po z?>s8fCD}TT5i&wGO&Bb7j^Z6OzQf#`m!TMLK0|ZepVs16{wq2sP)&vhvkychFTh3*@WvD0o(# zwO`^BD9C#vMwtA_^Vcm4n);C)pWSq}vBwRkj=PgLd|hM_Ol|fv*_i}Su9LyHVsD2$ zzbU2EPgtj7`KE)rgl)A~RV-!eJdpeX+ko|(8R4RrL>}qWnhP<+1EGKpZF{h-h?m3d z$NSyoEM1nzm(a|*g+ErPESHHmu z0)100Ek!k8O9mq$GJEz6O16$^VLA0Qv`M*1^i{+N|7(PN&kR8PMRavUk!^(pxHgC#*J`-L#f zHREk(`SDKU`F~ctlJ0uCljZ>j3ga`S(gzGW;r5?0cg0)?w~>%qit_>7FxmCtM0?mH zBYS z@g$nG*QmCZ9CC#g`7NlNs;>TC^sFeAHU(*0eA{A(gcaI)K&UDE)bG&N4p-5sdTn6_Prew zz?DUZi8p%RoYL+D87(fbDD<-CxnLF0kzeFL_4&gG-P(jH@P$3iMG}FMs}-;hJp1{n z*W~Dr&z||N^^qq#bMf->-j zzKrB$N5kso0t?rOA0J1rDzW_|u_&xmhwXz07LRXP!E zy;l_)dV41U5ABB&c(4S-tfW1jO(p)J`pHde!~%3~9GrQjb1{ras&}+t0zT?E5f*={ z9jr4dP#i7)+%m(mPP^7jHXzuOzs`ibyx@ni)U7nj_;-KK{AmPhl>he`wfkd1PI6*Vo>5HC69rL(1w!enM5SLWksN$apt4Ap=r&qqbrU~=Vek7!8CUuVA;S{Med(y>eb9wn}7U#DrJ+__H)ZJHK+`4-k6V#rwr{I%TJHx zBVfY#;?k?3{hLB!R-OXew(q)}cIqAJlB8ufp;eDZA`^msfmGn&l`fcAeakfq^{k z@{F_DA)w^e8S%@kzUA4uzmaUl8~g1h?>Nl>qM;YbqI)<1JqwUB*x9)UaQg8A>xEAT z*-AU_N}ex&=>0~4zEkFi9bpIo(o7NJ7P;zYQk7Y-FhUAU?M$@WD8UM<6LRxWCtb|c zg1)7>(Axu4!|JWaRbsC*UmE8N&61_Uk*OvU4svTI-e5(PMfM%)90OM@PEX_tC}XDW 
zlt0>b&P#>x@?=Y162Yup-JFsx9|D#M`ZgkD^}8Z&?{m5ies=x(_0sJ9zw$kL;btrq zWcOU+QC6RRLVtgNv(Bz-$41=TF_mm2eil!B-u3_0ki-jtX>$1jt)M}n+fNO}nw{{1 znMbL$tEUD$48~SDwcjAj2=84#ZPFq|2jv)Sz7x5)5ir8R7aJQ54(opU$$WQ+xS!MEFdt601XOv8N_ zv!mzpo~WX`L$p9<^ItP z{WlS3IDLzFjVC33vUE2WH9vGKI~r-uv_f{;bN``#Z&PZP|Akk18*=K-2N9p-)bOcG z`7B4@F9T%TVE*_w#DVa&4-xSpKVGMCy>5F1mmq0!@sibBt9$)bjU!%KY?x804Xt$n zp^E5<%HDG>4X|q{EHCa0S~8701DY7%X!TcLtTrh^o?Ln50;YJl!nF>j8{A5YU7U4A z@)Yasj_v=r7@{1TEp$FQI$G4^4j)8JpK5B{P@DP3$1sS|tREbEQ`LETLeVg9ca*@z zAXhQt`}}))#!J@Xd+SaY!?~1?$5wu~2bYIYz&6tc+{)mCk^oc}XngV>={r!{ckFs} zZ0N#A#b5+iN!uT9Enyyf({E7+uSQUvbGe)42NDX7;@Y=fpn*GBTjgC!m(xrC-X*Lg z0@=KwvtmCbYMh;XD%B|KD>OL#%0bD3H6t)`XuQ@)*#upUC=a$ zqP%G9V}IfB$G>M@b7AiIO}iMma{;0z>zV58P7w`6cQkI~teY>}QW%&}w*B1#QUyY% zaMxzOmlf)!nGJG;plB5vSKa->SN&&{0~Hk&FLE9}%=Y}$0vdTv&#B2mXSmX$tjpVx zxMKpdrabSK=R3)A^lKEo)v!6ega_5SH-X6Pmr!s?y&V#V#-8>x*mQWa{vdW=Cb`c)u#vXaM&*irS zD!7SAVY!pU`R2sORIL}^Gu1DeWm-~QRq(0q!hJnW@+liaf|=T;ihReCf;XX1fZVi( z_*e>s9^5-rns$a|-|hek%*)8&O2M4JTij_SZg*z!3dbYl z(Fu{ckH5aUoH0)?FQBrrL%jJt1^0hqY{;p3EXiH9$aQ#1%!2bbOY0>#p4X&(4u)I3 zY;X4Nd=|(eQS`3t@xiC#>PJxWk0+DW7*~38`bLS`ck6rgnWFer=VP$GnGp zRO`1nsnY%j&BLZRKtG=p^%s_JN;!1g)irc4?a*HRd5BGZLHF!*T{utM_H40i?L*|9 z_l)jFomz7J03Qh4$khB*JIDIlF9rq%&b&D?Id^A&ygKrgTYD~xU)%|8QHgr+b~qzz zwQ#`nx~Ik5BvQOMc@-xh017C2-h2S9i>pxQn%5E3U7$d6l)Lg2hPqbI~ybm;CE>0s_nzTfMnnUr|xfG%=NwQo1q2Jl_Bf zcE*)2!~KlAs)Toh=+FtP80EX)O<%JN$&+qy84}3=Is~@k->cTf=-l3)iy9D+ z4SI&>J>E}F2+Dgg1E|_>>B`N?=ntHsYbnJ&^pH#e>uVdQBzXS%ilU>m)c!3*rf%Fz zX}Irji(5!z?fi)&PVG_w@>tVdOn%b5snn2cT~+|Ot-b2fIr<-ZL}I|_9+z#;<9j6J z_05a?nmkIk9Q|gIZ4#Z$SQPJ_sPO7##qndE<5^DSf4U)k?Fd`UKJvbAFD+| z4AodN_e>_Thusr$1U&;JQ9U?`=oyb__C=wL6}?JGo(@+;Ro2w#AKW8#px{3*a{-BC zajC3zzgu4ILqvRx&ylo%(|-5iYwBi5HHP8H!P7jftwqLasq;)kD(t!<; z3n4Z$oUhXcK%97*F==dThZ6(~cb0z2)p8vSfzNW(i?jT~X8$&~=*RRWWr`8#^3}Oz zJO7BLG1jK)J75?-a!}(!uUt{m_F$@0VZ`s+rF6t>PE3EWrRByl6qGSRny0tVDy~qk zMlrDC#}DIMFQz6tjIaRLeo1;K{Vl2g>&4gIqM!(B6IB=VW`e;l4A 
zo2BfjxYJHmaZmBDe0sU03>-709-Qp&`NTv+w?i||i~FO$_>E!TC+Rwm;J>SWGHEI= zpx$Iq@cOMHC;$#}e0}$%q9RIzkeXL~rbh-7m2(H;64d;nqltBkSR8myiC}*vjiumN z=TFjxcy3q_e{|1fjy{V+YGb4pws+eNo#P{Xf?? z^T$kc8e@59937J>NRq3uGwRVTdU1W^Wh) ze(CZXSu!Wh*s&Wd75NY}4XIuko9#ZN1rlHOfn@jIT5Bw0!q-p}#Z5});!jU}y-`m? z>Zpd>bfkIYJ$yv>a(GSEzG)Gu%v!R1;gL@dH-R9qced8719aNvGom#IHRpZx_~QW4 zyYEq#$I=JvlOsS?k>Adq`M;levaC5I$iWeOWo2at^CVBb1Xc!Io40qkrRjY9^slL= zHc-zch1aiN54)psXc`aSWs;NMqaGobvT5;PZ$6<(r?#*^JTZ`z?yx^WB@{BJ^4G1v zuvfiM&&a~c;cQG|EBYCC?eS>^aBz1ixXbq2?+Z>0q$*<1>>bhaqb{mXoPz`0W4&rWr*moh|2UBC;;fqvzJMfY5JQd%2cUf-mWkU|k^lL?aUwk} zIx<>u^yVW6!BN* z+3%pHlHE6ou3aRWhL4U~>xv=#LRDi`8nRu4nb35(o9?{~nUaG4q|ZNJJ9{4+NC4xK z;?ZtA@LSU<#ea<5D6$x3cocBc-0X1pkt^U-<4OB)kgU)P>=I9= zk%BM7i;Fj_xX%34exICHnyqy24o;(n5l!NfKd>f;QB((yq#$}$lJh*bQda? zrO4h_?f5@MvNNX6a6JGbP1gh9z=MdB|L*|+f!B5|8&D!qC3HvkHJ>|z`6n$N9sR3- zUqGUA0GL)C|8oXmceBo>gC#Q4j*f+TCF>wJt@hNDRjP1&41!b~W?YL#Au+?rG{-gz zFc*|P_7mmfZ>CAbOf?!~e``d|hAOK>4iArE9DchKv#5jNC=~~|_tPV(Qs)Zh86=X) zMn(NRHJ{KYeAv5H`9D=u+=@D5X-_jwEF3}gLL~h-0;7`V|C}N!fvR1OOq#)(Nk;y)~7hM#&|75BG5Wq9AMn5Wn`xWM~kLj?|lp_0oqEL1(0-Il;Xz5My}m(V0cQ03BUK>Lt8{l z`ogpZb}^I9xoMc{73Rboi+b^2z(P|&GP(I5I&RWqsI>ZQ^UEn11 z)gv$^s|S5xz~>(gp7;&~R?*Y9xMwLOI-?de6hSQ*C2{Azb%l&&@q)os4$aF}ntMgBEK-SvE@y5WJm^ z?j1kdaH+!@V_clK|1o=570?6{bCinT%h*Divlj>vbVjG`${@~JALVBbySI>STRIye z`gp16HDFn}StUw!1oN8Kg*>0SZm_hi8g3?njsW$n3vk8y9M)wJrh8SQeX8yDdn^W? 
zSC^93wLo_zy%00}rSBbecaK_0>$X9ybvfP}@qqKh$;5q9otn#rh|4Js zr6;EA-Lx9&tWn!NGSqE~7i)c<^3O3HCq%P1+1|~$SW&i+-URCkQvE|po~Zeo#i(J0u&jOtrsGu6IwFG;7eKmJ$jdf zp{7q%4Nt;Ybs+UkbcBA>;-;9{!;MZ2Im%&k*T%Y0p!mqbB&*U{Z6IX<_I+xYp|#Lo zLIs8c4ggc@j$`Q6yW4wo(Qd@1ipRPe0cW=om)p^!I=%c*%%|%AMCp9~sY($;@UT?i zOoJu*hM8Ax)*`^;^ihi;g61Z^Jjsnh4F7$qrkp}hM|Len(a_b64E20eM%M$U#-x8- zg+E+HU-TKGA&U4gIwOjVh*pLwbcf(t5r_LRga+N_*_d-YQ*9%c(qjpT1oUnfQSvQp zsHqtQQ@wps?#MJJZ|P5hxq_31f>+e?kSc}1^9FOsm;ZUZq9L2@ zbkJWykm?MGw_@1x4oEkjVW9Da5b+fWTFu@ic_RsoOL&E(&Feall<*?^pZFNpE_(|a z%(CaDG;WC!N0)F*bx0$WMY$%;`hW(yVKCikj>V_2b0Pd-wn4QOPPf$qo_R}{U{ncy zitdbHS8W?htr_}o{&L;DxPt-`(vv2ve)HLws?-5WUT3ki&1B0@9WNqtMGWc|LK#deND zfZwNo%#K`|KHnEfFqxH-R_7W9Ai@$g$x)eq^AJH^TZhWQliOf6t~%=4W$;^@&7E!H zmjeP$F=0Zrc`f%i+@)S1f^9{50O3XELM9v=@%BG6&i-0o5eY(@Uc^Xng;;;F8K#Cn zI~I!!^l+-CBLPC20Aqzn?Kc7YY^iSCG&|H+B}PG{z##2x$Kv zuVrzs10Kx!>`UUAm;nLblc*k(jTIv}-bKh=1dqZJ0BG!HNUfion7~&6RsghA&Aswn zI-|+Q(H}hP2#TN5CsZ3u(h?Fx<7dY9OZb$7zPFH<@J@UFcXa3x7zG4B-t(#c-NHAX z|Hg#{G<(?3;UxReHDJ(~TMJsf?iyyq=KVzkc)4-iAQ&F>c@qyS-I$mT2x0&l^W5-$iT%*t5J| za<<6P&phJ)##rKsj&X3o`GQN$*@QeH4sP~uD(qrKb-ASASu$;E0Rgm&(SGj@$Bx_= zM?ZG-%eV5#w9X=b>~1{xMBp%S;|?4Q?1HaRIW;#kgT^^wNH?l8;;soMt4hosE3|+j z`8?)AbM}9_(Po0q0@5XN6=u+n{!#B--1E86A;<%_);}0C?vLE^uTn`aeL+6AT$kG0 z;_2tjqet`(BdOn{dng3<16*Vd@)^zM@Qk-hinlilEm=jA&(4LWdgNsZZIsn9>D6C5 zI)=VHUo>~%rw_&d?MFZp({gBO-07ngi)%vHY&~8ILUG7ff1Yh?xk8$Xv1+|S7qoK5 zS_9caCK$(8J45(srP>ScL+JA0Um<1af545{Ln(HrM;T~6Z%`7lqSYrsb3>PfF7f}D ziz%JFhpvcF;$oDb6TyLi)bpRK2>`1j*K+Yq$jq}bp_DIqc9z+J6^&HHRs&+WCYTGz zAR^c9TsVVo5JPly1R@t=Fo0p8`=8%-@Y-OY|M6P?4Rg2ym?3Us867kcV%`A`{7&eC zferqJ_bpn)byQbWTF#t_k(*%E)e^S@%V9CLFDfH3M;cLzcXD#RLVTP&$2s=(ZxD*) zkAIK5^zPNGJGjJtmfQa?7vVM3CF~tgah&9q2`Nsp*&~eLk*Jw(%#yo=toqV`iImRt zh3Bsb8O_c<|0RHiR+Q%pye>4E#N>0NZ%h*L1PHb~Ov1^#j{RjaLc;m>_1TH@3Y589 zpT7Tpf8=ul=AVTd$#bl)4h4au&9MnhJP-d~|04LN;_drw&z_(5-hrh4F(0%eSQ+ zu08woWJEHN2Li9|{RY=;j+r3!wZ+Pdc08Cd*clW5=M_^!mT{r>g?5PC!-Kue=yELi 
z>n<}70fuyqytKBR{%RSCK1@6@UQZD$g5<<}NWctQKdAN9gZFYAoKX{cM%9E~*%BuIUpP!!euV-XQ()tFi6)0Fhjp)EnL3zn^H$iM3x zQa+HYGIKdLR2AtiYtu~7Pw$UUg>hft+H~u=&PJF2@d{ETLgIoE^ElqsGDttdxS4Pz zbH^4A#vjhHuGyjd+_Wd{+8&34J!!BsLd?`1IT_yTXAn(3i$v418lKy7)8ZyArAe@k z$i1fk^^z609_t9wTZzc6Ja_Gn0;;))=HTuGL=(qvcV|nstS&}W#0Ty}aQcA+nfTuj z*J4O)Zot^M=qP8f_c*D4J-DSNagiU=%8X_%axcMwmk|At+mM|j&SD&PE+)pHq#>)Z zZ2FVw?ozO!Le{CFI_uD$!MyBs?m%idq=~c8B;A0>Vs)a^+?`>TSP`TnHH)C-vIEc# z4mMy$PslifZWTR60v7KzF$IwuNF>vLCn{K)UR6*kTb$WVzxRGkjn`1 z@!L(^J1Ll&iO=zpNUwQ1Loi?j?hB-S*SU8#R1$jAlfA)GVp3rouCNN%Cvk6 z2R-K&Q3hfm97U^})$-}zsTJ?TC=6om4#4y zF93JT$uayd7|hh7_??N^OKU8p?d1RK>uy8Cj~JKn&jr&G7u>Sjiecu0H}s(k0Qwh$ zGy^e|bnt>rfE6xZev6}({@~T`=|@XhMsSHfdLq0*B9Erv#V(7M^G(f(8q&8@+`r=( zCO3rR<{_~&noC>)UC&3h@`X*ZxO+o-#O*Gj|8~V9fYyCeik|DVmje_XUk2kLYKMm2n z?R@PVs1Gz@)5lA()4tB=MltdofF@H2#!#oku{jPRdD2}wbt&T8mtNr+0To7`nSLNP z8JRTW{@)>&m^D*MG6O-=j{Qjmi;kMb(#SV+qUv-rg4F!}egbk@exuWK;*RO z(xV&kI-ejj`ioUotpHY0wvc-qOq@Z>-6X)*Ncy{2#!4vpFrt>&%#pN}Q_06uR`vm$ zq#yJpdaL?pOhmW}l2>s2_*jkZ&%i~%G{3;I&jxffmu1IX(Hl)=5z~qbfV;Z7cCKT> zf|zrK;KQ}D>;I$y80ho}KbgJZVfBh+&NXr_ad5_S`)@kUWke9%D7SKDPgXk+^EFPOxp1VtabPk7F2)f=Yd| zBGwfJr=|VG9nMBiipt^_7gjjHS)$Fa&o_B|rsJyn3_1dT8#A~Sq%vYmrR1_Ho3`h`5l2|k!oTtbJYM8n}zo49jw6MGtlN0@7m&;pLE@ zr|g9O%oloV)M-*?L&XjqGFldO?8x%2hY_OI-K@Aj;|nnH!A+JnPcrlF$4j4QdOqiZ zTCHf+9Zg>2)7qxqu82dDcXk-qvi|nJR z<7W$w|ImRoqe0I2IMd2$RZLx6G0HROj=tqq0A?5gWUeFo^p$`vnJ82gSiN~zc1*wC zVh-YbzcdV1`Y)D2N%!JLuM;Qm%8B#pPH*VHD))R=6_E)NzsO3Hw5?wzb1RkUaG`{` zT$+2DPy5Qy+q?9oXo6XKng+jr@og13eE8em8i!RgLTC#rfJ*uDL=fvEiiy_f=dQ)T z596_Yoy;iT=G>prmDFWXV5UgMF+B1&Ea=b`px;NZ8qC;2Ogi#U8_V>SIIuoE{v*t> z$nN*)rvIZFwSY=q7Sg;KJv!A0hbiJ4k9Z=4BYP_UcvvUUa{jZ>z=(%1`LXni*F%R! 
zjPxr>e(o-&*Tq?}gyZcsKbKyO)>u>uZ@n(ABIEA$TMz+$PEOoMEQ7uv&+daV#x<+~ zsg$$u8hK0B|_b zunP?!F(u$fe81Lcp)B2+O8)5&cQhvqbrH)N#85Xfva=14fg=3>McdO?M`nFYvX9vg zUIKdE;}%O}XNZ+KI98Wcd}Y(2B+Z-mn2D?eXW_%V)-j-Fv6hWxt_1Vy>2Zz>?o_nQ zVFBMgSSmtC=ljG8qSJT}ic14vxaY&ZE(uvtU;d+gIDN!!yiT@R%b&TR07ykkOB|3G zyonOKreJ8l1h%u|A+5qv!twllfW@wKXlfQ15N8wEo3Qi*N)Op|3F-862NgbQEAh(T zd_1#%`pnSX4sd+F-faL`M}GRYCP+HaJa)wk(?S~O=VS#4Rv|iqZvaQWzY+-fF%nr! z@A@0dXRd%45;tgt5Dp(ejH896??_}(_%AI;!4>X}68+KMt`E-NtrV9R@zfi`HRq|! zOrpY--nTnmZ8|_b<#Rz$0NwTI@nbaT=3#7Le_DFVs~4AfivY#Y02*4;q@KHW#w7<> zP|?cFi>v{K&(0LwPr1J@1_j)m^;9Gxk5uA#xL9aq_e|=zQGgZpVjS?i+W5au9B-R0 zl7jxngnN{0zQ0PIRz<<2M~~ zfn+ZFoHL0v>jgBfKZKO#KgH$Su>v!39oBZ!s*sZ5Sw^5W<9Y2(S&x3;4c(>ZfDLc;WBI@Xvw7 zpW9x|iA3|~0L33hR@!C9@q?1M%Tfb0L{@Sj+%oVs(@^}N zuT^?1qF!^P8oPGq?_u8jUQ5N?nR;7=o+_t!tL^ zOnvG0kcHU)k59Mov2p0&X#x)mf?w7QZjC0$qBcoO9N2jwOnk?UCoBtxau;UUj?EWn z08!R}Bc%P;2oP7q>3|Q)*T(W2Sm)1!8--z72)-wVsmekRMQtwvAjOj)oyBR!fkv9| z_wJWqohLl(?ZB@W>Thj5fhEVJ?urx*7w-b`Kf&cr@+w8kk$I*2>8SSl$5>rl1AC@F z;r04Usq_jbF(uhNPVyY*8OyQJvX9=Iov+S);yU&a8M5F`7{6)Kx>iu5eP`#8&VRq7 zTJ@{>H^|g3vt6h{#hJJHTLVe_i7HA;=S^|k<_uG zR&3;lWLm(QNID-n4F%&d;Cvl(ySr~5`|&!0dGVxDpCl9)FmHVV{0zaD{}*`IGkBq& z9`1RDf_QOeDaA-M6qsC8m12~KHKI>R-TL3cF6o__pD;izV_F#j*4>Nx37iRixB{Uh z`g7C|5#BTvzFlvG*#`zc4DIgKbhy}jWwPTHs&+H+fQiYVq-l+4t?r}M4St+TF^^L^ z2EKhw2o)8!XdD=O^^Qs}+wrCD8bOV|ED9RU=pK!DwFy+jW+pS@r&6voz4@}^o@zFw z*WEQjv73swa(d0Ga&Nz@TI3rQ+oO@D_-sAWxqvS@HQ*CuaK|`W!(6dvX-VO3;LpO= ze(Ch>TIUnXc_+TdGTvVk=KL3h6>F8P>z*ZL;9;fO~F615%E|O zb={ykDg0`J<`XUG{XMtg{EIpSv#z7y)cx^JRR-kBu2yRookE+1&bqkv^se>g6T>GG zc=Hmq>xHF9e<0E@G!%0&cV)3J~AI%jC-^F-_pu4tKg<5 zoOzc@9QF>1x7Vk)+HE1z;2mh2_%d02ba#mOhhz8LM6Ls0w9re-ILZ8Vt!#ExR!r^3 zt_x8CBbh1Z&pa=X${rmp>%9=1v8u0bcBo6qtjYK4m{*zKkyU%D1$r&sMDO9hA>Fi* z+>1?;_sbqZluHY<7&wUfye#IWs;M|VxsYq%WdxBwO37_=)~Y&3vqK!!dVc;~jA0&( z4`IK7rHWA}^=!&E3%u{>PhCm-VPVwsx35dODx$099*c}%^c_(t#aaSj278jB>)`o= z9&_+ODMEd1X#aVpafk{^yXU$p?Q&T$`3EF&yKfb{TXS;Kgj8_B#eFYl+s7U}vYTkL zJDX-Hns!my??~UKJ!c{c@86Ra8qms778(8BP98~jq)n4K`5&7 
zP<)}3c#LcW#^XvhiD|r#=OTVf+50Mv{`2}pQbk_<-$Sp>?6e)GF5q1+G~t_+4NzXA z<w8_N31*;%Mfwz^@GvpC+EkV^93eMTsed+G4?!>Ifi;11u0f1=r6>MOFP|A#0o=3 z!Di-ZUzvT$d1TEJb<9t=VQx;@p6w)!zCxZld~9Xq_7lri$dKP(xn(EKHZ=w=8aal@YpzHA zId^VnXO)qOP&Blc7IBJ*G>{N57YS16~9KNG< zO(Vjj!2h^*?ONwgzsc>yt|%9mIbbQ>OND;nUnuki{t8m4y_ff?r5X0z;Wu?&9_n26 z(XpqaFJ9p06RG^SR@ZXw8!a22R%%Z8Y^$E-(*M$5Ay*}H%>y1?o4Sk_IpsAKe)gwE zV{WTlSI$vU7e8S1R?f+1k|XYNcT#6=HgxZI4gg z;hDFV=^RjTiJPdvvE9%S`Y2-i17vKWPNUj0I~T98&F%_#1EF4Kq}=3fMV0j~q{i#f8e*me$sA(j0=_$IK0*wv&^3#~VV*EbOBu*IZfh z2U?7f?8nJN9e$yjBb>vXg|(k6annQJ61RioTE?D@rcKV`^d}1?vX3j-syuY&A~_Wm zp9@h21WXJv{iux$Z0b$$ksQ`#EDsTo;NUp;(|By)VTyrzljDSXftk-c(XP@24R~P3 zJhR<$k#C}Syhg{mYIUK=nJ+n7W+U7$UxIbkx5Z#9S%APbP!Q3W3+&yiHv3{fXf~3P zk}S-d-z~&&_6rnx_TAgxsGp2dWSx0U^oRiN%fCI&M0?;X_ z!ZeU7R6upUA(mp_N}(3ycIv^DIWt#z%D-}i3I~XNLnuT+MbYnPVP@vChzr7Kcj1TQ zFS}g<*vYPIci_$A6&STCuw9NH-6$h&aZBIvO;A*z=8dE3EJ7C=Uue2$@5`%@vMCRt znH%!fA93_gWoBNI1vF7920< z^Hw2uuy_}6f(8`MIOBw)-dnwSiVq^3GX*+WE^7)zxX)1ourE7G@fM}^$ovqyC65dN zE?_VgLYs{Abh&$OJm+0&)c5_n+4$m4_d>7zpOZBl|(n z0mW#8Z{H$tCMB{WJ>rYqf!$=IJ6nFc^dEms=lpI}^RT^bG>@py^r>Ra_@tqViH>j6 z)9+3?aV^)}H@1&#b8A9YLh7oA;uoHZyz9KfwDS6|L8nyXK_-8!e;B9`-YdmELr$!= zvG)81=)aSDF!Vby1nRIvm!T*xSikW3TI2|PVyEi!QtY*kR*D6_cKY;b?Ar`leDCT{ z@NWU^7q|U&3Y%A1t-VSi{~_MkU8aga0iU?3$w}ks4e9?hb4199S^c9iGtJRy{e}%! 
zVcS(bgS|Qr63X4(-8)i*xRsH2 zRCB57{aRmgUuv$j5chKo$u(#dP*Sh@HvPsaIOn|3C*Kd#uwS187?Ogd;;9JAPcCAT~OggU7s6k9q2^;fQ3Nm%bTIU%sl&dv`0QdszD z#1l3p3=$vtSsoHyAFVw_0sFg=6@A}^H19Bl^?sYVm$J(pzH6G24+h7b9j9A!$GU&U zX*g&!AD2HBGx)_md8?DMLA^(<=ACi`?4#z-|nI^cJ zo~AmLlfgvdx(%n67#-Q3Bu?{G+rl?VNZ-YL$^#O89T zd9(OXMQvTDoBoTaw@PBK6<)20++2T?l3FY$58eln&QhrutZ=%d!kpo;gi{8_4@LXh zcP^lVL8qjec60kvTE?lx(ysdefo02hA|a{q0l>U81jZsvZX$%E4>@miezU*TzW z;z2hIDFV4vje&?v);8JqzQDo%YH|7z4X}i>*Uhk zPuQ;g)%+FfH_v2v$@VXMH#<4xojcZF{Y~c#bAjhskIs)zC&EmutesLbTpF2Jsx7MG zor6O}hSQp^4J%}IZ&_{&X=mS6;bv&i-DaYt#i z!^IcQB{|k@SS0nPL&uT?zwwZP_3tBT*o+*9iT5r)RKMAvd*K?RqfsTqAAfre?96UJ zGxQ)K5%P&OX8g}A+CZNSE?gk0*lB8>Iu){U0u#56UO}!ycpS=0KTB@yvM(MlzI613 zkh`f-r|)Np({F2USG~MBTfzKPd`@p$TYP^H_Owi8MK;$ABt-4-E7&vk>&Eo&lJ}pb z^+yv1PbS+R)Gy1i9y(HK98h|%_iO3OND-GY#rHE*=Xdyq7K@9zI4H&*;w&Qv=s=i* z8k*5{+$+JZ$0EFx4yRHOHlG);m;E@vFb^y}XBh8GF2~gV?89FOWVI>hHKl!Z|MAW9 zVx`WViw=z{mHPUxd$2lD)D1gax9;Dcjv3x$A*!I>{(hI1yk&pxu{X#s`Ry#0H}|f; zWNo!C8AkHLJT1YO0v6 zY4vwW{N#wkEnuqv^R+4k0UTOy&bntX^<%xCd#`g^d&pC2Rk{4XUANly00FM4@jN=hzO-nvCioN#&t z`m6Sxap1-pJ#^ugK0taOp`8UM??N^UX^GF(!)_P=7r#)Mc&5?6 z!=oXci zJkF!r>(7~bDUpw1O%B#L;|O{FsX9I~T(S%KG%Uj|PV|9i_x~I@uSpjq`|Y?~y5n#6 zrVl&Pey^E4!6GX)061&7zw)8|XP*{qjg@WZIq+P3^+Vd5RBBQ$c1=~Zjifq~ycZ@3E#^&@(Z z&oAJ0J~8C5Lj8^V4o&L?F!apYu<7nxRrF`f5gkT)B7yD@{9c+(@S^H@etR7kMCBW2 zCk&F=)-Inf!7hopRa7VoaiOmJ9~Zly4AbIWfI-2l9r46mbsG;4vB}1sBw-*|*)h=U zR}s^z#7ACvirppu?Mh?Sq9t3gVnws_ z)KAQSKqnn6V6liWl*7CEgxcun$;$b8D{{|f{g$|d9ABMC#IP?)BcDY33tvM_bv&%~!)?;}6)GD_3So_^yh<~^+*`Zfu3*0yPSGsz}n{8#j6Mz)BnCSE$> zq}k@zs?oES?n12$`&n|sS6;a9@OvmCT=l)-1W<(DL-neu8J!BXjNn4tsyy0)O`%#T zX8yt7mftZt9wZQn=`9B3N8{oyZN}e0$^P`&v&e!L5K8Bi942gJ*Pnf3H)LF|cO!Y> zOu0&_5KT&tyoO-Kf2QD@q@Q5&%fZ0`u|t-`V{9J((A?~T8s^W5*sKmrQe#WnS!5IU_BHwRxPR4ixPVgh{{8zs(c69I4n$Vw`52OfSLU*LU%zT{ z(_?4%i&fpk>KfBTD6|%`mDuHF+Hv*>dz1Vf3asz?4JJ>VSxkg-(#xELmdB_=^XafL zG(>grkSqcI2BL&cc?As(vFPx$9OgfP)ogbkanW%csy9Q4QyMrS-=u1Ou-27K|)!?v5XAZHPl>-KbmIVc*v}jD6KAJVNh*NfU#2{wR*I-JBb#sR 
z|A(Iw@oQ4q`7Jx-wCmXU`S~Rz^zNB}f(UB-I*#+)r<81`2xco`E4vlGViVeXMDmjmDIOYNI}lPIdT0hcUXyl|9OpE#L2* zziHUyNHKgYStH?%VOgxn^Wz&?nONdXd_8kR#AOop_ypVwPLqEw?y*{#j2M)v(;EIH z)6SRDvj~A93kZ+j%fnnST@7=d?2Zi7Q!yc?{PTw(m~kAtoxyiHzo7|>ldQ*Zaf0wtNZ2K#jA3yVi51B1^O6ixVKf_X^%?+8T4JKZ?@IRi9ZR~j#R4;LJ-Nu0 zAFDTQBDTpoK`j)2Up*hhvgjqeqa_?bsMv#BluA-k8Z762A9Z+lL((z#t5WleJ=^^1 zvaN^g`*{;)tiC^JD%j;=+nZ|9!_3%K7MNQvC{*>5-0Hc~?c2Aj1dU&XlRv#$Byf{r z?O;dQ*Q}BA_G4#dXJ`A`TkQ*YBvxlQ6lq!wKck>!jA>5v4COeIcgz4IAfP>7LJ6j( zpPzmpmPkWK0r7W<%4NI3jyJhc}VkC>$wD556JX6Vr=yL zYma|eBj@=xahb)(4 zpJv*VGpf2!^xfWAI-D?LIZq}<$Zc-%`JOv~uc)&3lDG38T& z&`jHqQrd-yiNfY7z(JCX&eH?=PPubQdw!H?BkoMUP3}BG_seRclHwhCXo704>Q}v+ z5jo#pLlYMo`@}?eCjI4?IKKYIk*H9$-;Zy76O6j+lB*Pmrp|x(Q}>tfl5b|36gOy? zSj8OO>r=?@e0OSkRi=<>W4^D$XrG9PS-1NEi&V2qBZJ##ek2P`>)U1f;rpV)!o;Cu z>&;jYHflPstpJeYhHX?E|ZM?QjQayfxyWxg|5MEW`%e)buJM_oXA9`w1p(i}&`%?@Phjs^ zQqDM>M?pbuuF>NERU}lxuC1g_m3}M6aXDgv79*b>r4^jgRMeYetlUFF9*;zDfS$)Z z*4?L=Abk7YjKzrS^6<=v$|vVu`8rphU(a>2@*SKs*QY!G>_LA!)<<{$%wIFxkUE$p zoNudb{2|Yn=8&8`JZay0%Kv_g)Q^iXk9MBS>aY?z&%~wiwz_(E+vtnLc3B(Bx%CW} zbu74Esp&h4Sh@MFr8q?Sur!^8mG$TM?*YC+3Z@6aI_Wg2%gV}9sk|XfiuB%OC{_ql zQi<`F0{ zrjw4V$~`0vbR+8p+d0dbYjRDduW3$q^>IzypWYBrV9=8I>iEg5N5_&^9XVbM6$#H7 zgL6aWL8^oAeD?Mg+fxta&hqNdy5AJb3_98Zc0})*GyqX0YsI*Nrl6A9d0c+Qy2T0i zrN7*ObDX^Be?(?RlO9q`wKwV;*R6Yh=*Y?e20^@L?G~aaI7_&r3KfE`gYQ)j^HZaL zDP0@VpMF?1NE6#h7L|GId7_``+5754q{eI~ z`SeDt>INBA_a%i$yg8!yaCRcFL{;Ui!JETFH3?{;fBS`hh8#%fY5Xan6|3SSc94J&wOcDuYAIT~0?u=LdC>eBnR9vd zw6XLpvG@98AC71z8!5EzR0!NXH|xp%3r=`!IvC1mosNA|_Bt3QHF z5pto%3B_`bD)p|C!2TC$O_SYGVZ(b$Vkswim&TC>4AB}d9;t220=_K*t%vM_!f2tE z2X=)f(q;IwY>&ChPaYvkCCQoB#s3oc{ym@!M38s9906wn&fKOOq;M9{oEys}5i47W zYqQ>s2YsAYgr^iRMhsca52*LruoftV`f)~eZleS99kkW9$q}F?kN&G-!Rqsj=&UOCF=k%A{K~=RW?y6WR&4YJIM$e9x0RE# zpswzKhwZ0BH||KNzdAb7)vM<4{77u`dcLiz8ne%ieQG)DF*iFlU3cqS+S#*bPw6(I z2~mwd++D2hG}9}zQziZ|jqGgygv*wT%_{et94)(cDJ%b6BlbKmo{OzKRQR*y8pvU9 z-MaNr?6g2*C;bzZL+sxGJNlVwU|$7T)?0%pH!bpu+>Y_`LxPW|SLMnN2II^LgLfy0 z^4RSo!#K#{-%`#Kj;aw(nNIU&cd>? 
z-N$+Jym?iJY7enPC7KMG;HkokKc1E}{u`s`?<} zD=8?uSf=OYlN%aE5G6h*OMT)( zCH>#K%euk90Rdy1QE5;P+-^`|2R2L}ELW$~X(NlTx={PVw zgabi?IaVYRTrG$*Ec(Bkp8os2CK6zTUmi>f!E(L0e`2aXw;8)EmatNRCQpAwnZ~*8rqy!5Io>*|nU+xSLw{5Ud$!80gPprMnW!>^}|T{O7_{ z4Wv^}TPEo7Y5UQ^qDc*B8zAHMG0V53ULRF=ca(>WYmqMYGM1^mK_MI{KeA;BLN zX!ZE?fI*7Aog7VU@Y_2@XDBe0nrii6gLEzjQUpSZp18G)RBgIFpas-L*ob2Y_BFFG zQ?c{+Gcb35EaCd|p+8Y|9y+u9pU@eBqCf^jK}7{tg0$poaL5Si!|#u5OJ}W=6KBnM z>U|I{yy0~zEv;IUFef5@K#1*e{dz=)FXpC1(Sn<+TKSMv$+$|zQs_C2s83AELs>N< zSa|+qhekK5?mvj^e34T7*fq=Sgv*F$LPA30akpjvq_$JD`{a`;iH|%$^q=@1JZTL| zfi6Nelpjmu3ju%P#rCutC*g1Bkin>EuES5MgCQH$0o*F(W+}FO#o^PP)bmeX`sv;~ z2hc1+FO0f;*cpmp|J(u|{hY<-B_j`{kr7=IxPTD~0X024BeY!i+PpRE;<22+D1ytt zoyU#@d(2mEQo@gUX5ihbrIS*)4NZ`t@2W?puz(ezDHODW^cMxd^L6`HF$;76psM0K zLL^?2H?8m%%;E|0DL7WW?brzG_x1H%SVJ$wr}b_q-h`odL3hT$!0>RP&?B%@jLDMO z=ojG9KXpK&lN%@}R1_4<1rvCyU~SAIoS+h0xH|i@@a$I3F$f+cUoFSyhIJ!_lG}($ zDbV;;65|HwrZp0-00{sSQ%n5t8d7|?nnVVpB4Vx9X}ZBj0eN{+^Y^T_iO-5rB>zQnlOFQUQfsijhV0*1jS)?nt9z|sbKj-63ZIV zK}_2X!|Cb45H^79IMW^H8SmujyUc*Gic?lx%3eB8i~Va^ zFk^+L;K0yOa@}PC{(r|ubB!ku`dBKQ(ip3j-PykZ6*pQ;2U4@=dKQ++Ewy2M$iCqpZH;i?MaF5<#INGA|J9xkqQCxTh6yA;a9PK}u9_*I3d@(HvFWb&&O&rt}{&# zJUga=#+*&ZO3$=Fl)x|BEF;yk&FJ`Y?K-LcD_CWKQzBOvL4 zL#B`_A_0Tmi8PN`5&ri{yA-x-{ghj}Z*tOZ2nQuEP*k~Mq7eI~Sh^rUCsIW?hWMc^ z17w2%go{!4m%i$_ub?20EGY?VAGqonv%9<^a?f#k@BIJqWRd+=ty~$9P>!GzSpW+Y zW867ZMYv8nr0(bvKQh#&zU4+Dc*qO{miil)nije30$hs<$aH0~nwE@zgfbW^PG zeV_}OYI4qk!iIR*4`^XUS^zNDWzM;}7mg8`FIW2TKce14G7y(RQ}eU6zK2*Zsg67R_T4*r`RrwjZP4g0v}ZHUHg`8I zBM(hkf9D3M}ud~sv4MkKb`e}<$E(*ZMS@t>rY}=(q)2A z{<{7?Y36FENVH4<4~#LbKp0mc`~CYj46LYWX<7E1xlX_WuGdz``JX@_ACVUyYp@z* zasLB%v8*G=F|F`Xx)#PDO)8caV>tOV+Rbi^s@E?%C!$&d>#%= z5PvpF9xQ~bU-Xv9X6nyxgeS7sV4S7+tt~P(Z*~hBwYuBvd-FDF!``#PN zxL5N~thjjbqOlV!Gv{v|#-PWr(Bh*1=#S6RVHg!{-JNzc((6R|T2#f$si9VXjcok+ z^Jg>O!C=VOu_7yGVJiUs_1fooNQ5J96O)rYng7L}d^pLLEn9{eP5_k4@1-7JB1{EF zw3U^W;gP6DvIhJ6HyPz-zZ1hK-%!eVsp}J~x}|}&`gM$il5=7TWo2X@D>jr)7F$xm zb^o0JwOJq)ec6viKQV)^WXK0;Sr0n#R?N15Sn^73Bb53Y#NEqlqUc~_Px8tKYb}n` 
z7iv^E-4VN&mH>}DV~SycYmnV^!UWRTqd1RLAChu);`wE`&n+zmF?(xK>pv<4jsUeg zQXCNACE`OLHis|IoE_hQ(+;t`NgF3cg|S~2R=IFXRsm=Jns*5dO?U0=?;%#L8qr_v^tk7uMYpuXbfHem+e3~Te0!Wb zu?i*3tFJh=#Zr#cjw17cVBH>7@Q~=mJ!tXqZ>-&y{2YllWgVr~?n6G(d-m)xc2d3X z?d2u$I6Y{j3~p>jjmy=q8`S#S#ylEXFap7qVct4G;D0YFiOMn%2wYeLH$gg@@3kD( z5vz~Hd_mroWu6Yycdo+{LWdBFvkNFUyC3Z~7X^O5c2Xtx=af~Rl3LH}GY>KvaxJ&E z)NW%J6Fa3r5h$y_bWt*4hMM zxhEmK%)$#B-S1pLDyje7CK4Hd7E<>usH(W>Avb!#3#IT8j4z;03j1?{))e~s_!#JK zO|AQ^5vxW_BLiAfE`B{d-ElxzRE_c`!32CL)Y)G5h4Z9{y}kXB!-pf+4TM_w)6BC&H81ZV--oZhqXEj|3d$cYpkUxh%k6jsF^Cb$}HSyZd zRu=V^FGP(&;VY!1`p-}Gr|XacUV!c^74`_)jE-2xHE9?*Dy2cehFv72&OVP92|4kO zx935Ec{sX*J&x+Z(T38Sr~$0G4)Lq8N8jxz6M4?-vgJ~QO;Mnuw*uo)stPL`bsxh zb1%=BnVZMQ#px)9ox*^QOE2llwLl%iv4ILchoORzW|z91y1dg4yxiO>#y`9Lw_4h)a-a<<{4_oj^pUN^f{=o&}O4z*}o?-Uj`{@(FhA?J}* zzuxPkJ^7;j^!@_Zz^6Pu*OAMZ5-X{1(pt zsS|6pGp@wI+!Q4DKg8Vs#)J7MfG=MKL-+ZhAWUow$i#*_1O>Yjn!q-e&XNvXsq+FP zSqr}JCv%yZ_$aDdFC%m0(Q*0pdkH41rP@*2rtXG|vI7>z_qOXp9AM~WNsWW%gC5$~ zGq}#sq9Hw8lQmkadfP+mW4g08Z8lg6l>7_LlEO*1Cj!8SgnQ-Y%_9C<_KhhT-T0+T ziE0~RhogSa24@?EKSr+VOO(aG-MfEW$W3}uo2lziccWVpH2Jhis17k7yGqXhC+*K3 z3tf7n$;2D&lYN)BNw%fOzhuV2<|yP?&nMMB%oS5gu#TM=6r!B_778c93AXD&5rM8s zZ5rW+DB_qv#8tjQrJ*T9c(xMgcY|PJ6AI>w03rmkW=MFF5UK0W8 z`BW=6MJv5=nVZEF1wG=MrT9V#Kr%6Z)eVjVkMcCFo`Vl2KA{~`z2ehj@ockh&7ysu zezj`9*7EH5b_s-2U#3FL1oGdP#cQjnJ&0VHZ^q!h>9&`baBuK!N1zOK_=}+- zE*|iRyw3V^YxxyHv4z~kyn|TR_(oj>^hOMVFr@}e1|)JG>oUxeLRe6n1vPFMc=4z4 zI4ha|Up#`2B|Hx!v5wwkbb3f*G2Jk_RLzmlfgu4bMV1n(&f1pyhjpe7Z#pf;tDT(j z{$r#e=jB(g?2~!4Lqc>zM(%2x>nnY^IDPr?=`AB0_d0J)l&!O5Jk{7>?XBI`>)hg_~el(=9!!Sf6gXnJPmn!9_b%d8zKIpsYk5=@NuyMvU;Lltk^E1kW`Mekq(3n+f!?Pdbm7$IH~g0WzCw`kVD|*tG&F6qSj;+NN=Z&Gmz@v0q=l&| zURo+Un&{n*Gzgygc9{XfVqU%ouK zuCw1gRPVs>T@EkrJ6{Exnhwmv+K8S9)}{fJ2tH2q+{D+AhBE?x@578~|Mx$k4@#}q ztr6kZp*%RK;>RAh1V*8-i*Mrg|N5%%g^FF8NZL`>S(m>!$KO*JLot5lJ+;#YWVsL& zVt??0DMlYA^-ZES$L{H`->Xa*)&C2Fs(mg6xC7dT^?Sf=eZ8?2r6a(P&fesxQ2V1K z1PYKd2v*uIFGIV@VI;CyxQj|p>C+jCRqbv2$$~WV2DRoep1R=Qg5oyd!)jk%PlFhx 
zTX*liuHR!V&lMvQm>y(}JNTNeHj&P~9#fMzZT*GlJ<7|TcGPI%D^Tnijqb1dE*E0< zwrIp;yX4$$4xs2@I!Vxq8FEJJ&*NHF|3O;6l7)wdPmXi_4+84yL~UXT@Ad1~P_v^q>-;YIuCk)c!gfcB2IA;Vgsn&ZbVj_7f~Yecc0 zYt`i?robJn-HhX=RU@9Lwsk`rTGB_0C(nS13^PZ10C`83U-Ym4(Di|xRl0@P2wVH= znM5E$B*Yaee@@VtlObB(~&%q~bxjKx_ZCbcK`G;ht zLmhI!4FtvD*yb&LtW0=sMSOlM_d>gsE2Yi$h@wa7X1K|v$FO=s%H zP0Mh@PsuL*`X^4D2%se7ggQ-1acB1{}*>a%a!pJ(o-Pqe&OzoVeWv#WK9{%im`r^v4KMU zCzn9^h?CS$n^ib2mTeh5ipH|VfUL<=M}W^KwTdM=cKN$lYS@S?>G;HXenW`9F!CD6 z-OablQ|R0ydGfKm%}myJ{(z@$RfbzkvQ8wp=?-I!wiisI?~*U0T1_RLyFFCBD~;84I@~{ zJcHym1^U$-30RaB3B8NX^O_7@kKcY>`#bi2u~8`x;M>4xr}=p3LtwCrgiBEUars2^ z&so2>&Q6`LxjRv^F{@#^ASF(GdA(eWRHfj!gpO6)>j|0LjQcj-zJ2#pwt4HL&N~r2 z`IicNUW%2?X*3_?6EZa&IX5&(@YGI^RJ8Aqy&E=ZA9XQcs5E$@Zdj=x+DgME=hIt% z$ynWFc5ZIViM9!SzDEwfn}>`qZ0r9On*Ec-ExU=O?XjSyK$$>!xVEZ9D5QTzIhZSA z#1&-nt8^WNUmzd&y`6^&I+YC#4J-A!0|7xRqpjX8SBNV%sry)P{u+t-=0W^M4=B(9 zbVdmn63dFLy2l`ucdP}^9vr{@F0;8VzcJA;gZa|-qYwCd4=q;P1@iZfICn$_9ocj6Cjl^sMW0KvO=d3i;LI^)DFTP2YWDnS z{AnF0L}@v3ZOSI|O(sjh?{78q&u->G6-6N}5!Cq3rQapN3q7X<8xtGCCu?;)5xxv){Vie3}AqTE1Lchet zXQZ8Vbx-b&U8qA;c3eVFf}udS5G7c$D9oTsvJM?yfPC{18b|NId*Slr#6;cTIcAps z1onq3USDh!*}q>aN8-!Z-Gs&90)hbY#9mT|v!srlSi9Os^HYCwW=Y@M6#c7QN=;Ou z+jTPxenD5Ik{18*=1B8aF5%TAFMtfgGGP$Yl1u7NI)Bb$4!g%TE9aMO(`i|*c!9HQ zJn8a~aH=qC!8Ig(9-pn3v28M>j=gi)eszy03D*rgv0KrYfSV44n`2WzQhg(sIHnBl zdOqyQuNIMVsUE_Y%IJ-HhDrHmYs63C@u?|=NVf+5`PS>|Hn;S2Y_8gWb>K;pf*IY_ zAIeK*%W3lA9jgpga&mH33JP)lYewWMWxN(1(n-*du3pVjow+zP)7@z(=)n2SzzgW&i>7PQavKbg-f>QvhQ(@a_pnt|_^x>P@VMt(?zt30=rUBQ$F#Fw zt&`}HC0$qT2%MO_(cY6Qg@xq zx_)eWvU-=pEKS{TyNDCrU`TPrSQEEe`oq&5H&tH|L@F^~*R1%Hsn8&w{K>d>t=|=9 z@{~31W4a&il0Gk8BD^$X&2_wMYL+MYDEjw|7w-> zdlMVD{dJ<}9}XAOIc$eRf$}mf&|)hVbUiU4c_8MUCZ{u^T(@4NbVtcy62h~Hf(t~s z1_yt`=ONzX>9^$DiG_2R))WPv!?r#ZQZFrV1gem`m%no}Z%Z`zwbzcb$?_E$1?@hw znv8S5eNWuKFFV^bHpLg5^d_U`e1#Uh%-}Q0t4b~@xmABbp#3GepIv?1lZ^`OE~eiy zetXEDl}Nud8xpF|Z<`TbYA_ft6Py2%^_ysJjoMG>ta)_O)PHi)*9*rubyO_#@2^a5 zW?3d)busqZGmlk@8-(6h=@iHP3D5$?HBKlWU06e`M+g?cdxRci{ 
z#sPt3R~S%&`;K+Kg3i-d8xO9(PQPju8>T1oQ`-mBg`#(Jr+$f6TcYAsx%$Lrv5Aoa z8;elc-bK1P1-*ORdJ`bF+Gd#D)=h4_@&y%H1vs4l(j^ zCZTa7_9c^Zp`vk*em17GPp0g8W7(z7%tBY-w3lC4_y}9=w;9SY=J9YfjUVy~Av^SE zwP`gRE9Nd}#~ox9XLLT(7@P3T`o~iNq8GnB(l&Y<84VsIo@@MH!T)rl455V$rce130Q@ ztV3SCdNrr%dg(-hluv}w7z9t4tnhO6woV75?k3jwMm9+?Zr-3F)y6 z+A3bgYSr8gwVod;XA+JVDVUH>Lf><_!_FYl1mxOdpF@n@s(QMn&)n$@v zb-Lrvb6Wh9!Rl!A%z(`tol>oPHLTB}(}aHg;K76NQanMr?UhQn?Ze5PrAkNg#4|s- zt||~4t~XKg@L-jU-Pc>!<>a(4C>^Ds5rZq3^VQ#L%6hp|^9?#H6NXQKopsIc!Ra6L zJg+aHZ_P2JAgy#6(?aDFZ3PLB?Um5k0?=(lO7FP`FSPz zB0-{mz8P7!^M5rXAXUSX)1oe^6V3B%+a12DD+FyC$RJ#che7Bg^;mK{2xjJrfG-kz z+$QORuXYXF{pftQWvfos;AEX()a(JP9$c~s+I9B7CPY^KG%{`Re}_69EM8sZpU<8? zQ!_t*P_3h}*|{#-J677Dyl-?+zXu1fI28n91@NlrEMUOQfy0Yd)!@$4ei62+P=I&)Y%|)$zEe&A+ zidgCOYlD^H`uU?IOpM*jocGf4_>;|!Y+0&1f>E(UcejaWOUraXz^6BkQcPWRY5i(#)fxhu;ny(rK zu0iH893GIMroMQpV8w!t#gbM^I5`d*YO7T|o@la1yI)7?Igj%nAx`vi7sF^(`OA;P zd!+Q&99-F18l)=3NrW{JgBLMPl>pEI;?`ot$7eV{IBSri)qF(*coPTJ6U5p9Up{9ASCQJShe}=83gfg!~YI3I^}dBrTsQ+=>sLl(*a*yt*)$FE=@P zsD0IToa$r~#mAx!lMRwl<)I>W+|%h|@h&s<)gL9T3c^~^QBTU;S+_g9f59mO*iGpB z7eMDUJh0y+@HO>8KDmm(qp$66${sC#D$WE1O^+aLpiS#p3sbhTg<(J`Ieetg!H^6K z(9G6u-mC}i9Y}fd6z-&<#Qf>>sUx5sJ%t9IwYgYhYrNJV=ii4H)Y*ye!~l@ZaUdUD z-Rj}?_I3;mcj`aAw_(qrrO39Hh-}yI`_dR7nnuU6xs<$O3lj|$Re2k!;IMqr9Qfb%F(Vzve_kXi5u#``|<%? 
z2DMIe2Jw7Z^jC6QM}vwuB60f?%a_FBAebQ7Ccv&AW=#zB3NhV$_x3F|)|CeKFHZIH z&VmFw@X+;2+HSXaUhnDL*J4k9&(wDg87oRyahIlY`W<=ZdDpQ;M&sheEG3b9h6f0> ziNQOvP){A~hAAt!`mNW% z*;+h`oJumMB;VH!+ythEvJMa-LHy+sp2jL5R_Eu(NE?j*Rg%yD&v+VQTo+2^Q=N+< zA#NkshKJ{)kFP}y38{tx;cACT5?j|9xEah)Z2qZRI2Sr)q8!FMH9G76)@`DrxLqGo zm9mpcvscH1R9m_FMht`HS#L=E#jRmYPq28|ztD6jHV}}wy3~60s<@TD1)##F*6*p7 zFWFw2k@@D0e5;$@^(P!<}w&P^wSHL7A-k$(9sDU_k; zdcYj<$ji%y@6xZnZp8%j>y;GAL4|84kaf9FWS9q8GIFcMo`e(&f@~wZV=na(pN+H4 zclY=7tp6>URuzqrLKLwkmk28H1T}?8u_1o5e+Pof?ib`&f{^7`R83_@Q z$t%b$Ut!mq^cR5;r1W-l7}f-W1XO#+H8akM*zR@7aN?;TzbgkNvvgNU!^2 z^I&v=%KuzYBOue={*)?~dG4FNCNB%}E&THT^}2nTe5$>{tevdUu_%MzPD&OSzF-F# zs&@cdTnbGw#O_?{i^E9{P}SGkHuw0vF31y&Qfg$=(o9c;YO>!mn7vl7FU0SJQ z0CIjyoiXqWUP4<5dHgUVGXy`D>$dT|< z&+l)V>A#*%C`rAwuE-!;sS=~Q+j?Z$ThIQJ1m?b-8J z>vJY{B}64+-ab%;Dp^cdAO9Yy0b%r-#PQ zt6u}e{bJ5Kt$zE$hk(h8;fSjW+KeLjp7W+q*;BaZ?6M=`cS#RT604+hL%aiGNj&d0 z1NlKqYJyZH<|PF~TlZ_%A}bAWw|m4Og-C)ag>Gglk}z0_EW+kjhsySiaP7r{N@UnS zeggmNYA!)WN)?axgNXg+bIr{w+tkH_j%^y8;+y=8n)3^6Hs-%A3a!DG z9CL7=UfWx1+sY+~B=X!O$T4N}Gz(8?ge7r&*LY?+y!CTLIp=ehrD*&SEM5uWC) zbfBIA1TNmj%d4nox(!S|Hi=V9rH3bmJdk$Yd$!`RtSrm(j@DMA!iPT#K7QD(Gkv4= zoO$2!J1iT#BJ#oFd|45TMHjolH8Mp- z#TzKDUdbFe(xXl#-yY$j&M&HdbP!}f6AX|Fh<({r9P|7@Z0#mUvk!12};sRUM{`#0|UF8caQ%} zb=rFAZ8V3O904pQ&IT92s`IWkzf});jETBzw&m5_ibPM+fzMN<0GEze$*kB|cow23 zaDn*aZ$F_~Q%X?AL@)b&lrPJGrnEes?BMOy5aVAUC}W`Y!vPt#ZyPp5m)iod0osuC z^cGRH@R9=@+X9};^pN+)7g)eswc!D``q3qt?f;5*{suw&{lz zm8)DgNMVv+hzWGa<{PAH%hr)0S^MLm0!9$}W}(5`T@e|TJS6Qb6AvYMU(8xHKE}f` zvt#b^{;wZD`N{IQtNfAWNbz=|i2Kr$jJzsYM^jAHI876s8)Gdrc|Q;RT#k|@vzHW! 
zHmQJ%5T_jOUR_tD&w)lcwkI#qVd(pDA^~KDq+|J8MGtcaLyzI@)g3;dj#qnaW4N@< z&DHfU+yu#zEtF<@@VW=bNG)FL1aS|X+%Q7`whfY*gqn8yc#w(j6jMmkLlUEXlNMng zoYZt*A8RhZf6o&nc||TX-1Eizzdf^6pSo46$LeMrY-6-T_S;;YzIBl8-s8TGjtF1-OThPv*u4KAU)LSb^V+_T zb{TCW(bmvXNVKie)YKF$g(A|{LZp&XNfQk%r8Kq7hA1ixEfkfIG_-%$r$L=M4$o$IT%W>7p{7DUfTG5jUv5wM)W1Gyd&3 zLWC;6l|tiS3{_Q8fKQ!Z!*5r2v^D=b1_VNin!W&W9ANIe#F;9{z3yO~R-WU?w=cJ9 zPIU)o1WvohDc79cat7c9`6`}^Ck=`5}v$&{nTUh>-4EFesKdI_Ub58M@2`M zt4D0iezPDPVmH+R8MD~@z2)#7Zi3{NaV}`{n~hV|YQckUyWXFHO?kF3F=URDY$v&Q z6`FMAXb*Od;^V?&h_K!~MJG={5mF?rYX2B}6cIIk~5{p&k`^|2X+B1dAgV%E|z`V(H z14^wBtk}KL)0NdQL^7VNxj?(XP-5K&RxfVogt8t8+a=l$O0fY%Kwi8&>>>xDRhycS z&17gq+KtK&z|U0to~I7_0f|UW03g5Sj^&m z#nT^F8`7gi*B&3~UvDNG*_J=H(Q^5`2EtSB#?k{ZA0>7i2eBxDN*<}Ye@lR$pBPbK zA3N4B3(Ds~5fKr09x~}%CT;>W0(k>+Ux0nQdHYC7Ns;#472X7vBh&}*J5E87mb*a| z6IH{0=$J844gJETpd20}WoFxD zwzI=HukUk)%oQ1DK?&Zy83#%fo_g$RtxbFkl-xZwo~I`=#?7Ur^o*_6Rk4J}U6-UI zxaD{IKVojM>5*QxjB4{2Q-k89iN_Lowi(`yn{Dq_1)l#%FVPbK|LC_T*~%+&B<{OS zJU-sxk$v<>;&RtpXQHNi&rPQtl9QABvVGXrx~@0oEA5v2#7kxlobw4)XE#ark1(7U z2E%pN&c!9li@qO!+mUXNAMbRmeGq;^+tYQj%$2yPAD(0(;}Xv1*_N(I=jBG;l~k6m zJ&qTU5{;QN4R9bt)k0L8@$)@P$dX6%F;e7557U{&jS3 zC4cnjQM>!7afF&Pa^MudYP%Wu^;NkNV$OJPrn&e^=fyDo;ps)AKslU%$ajz3KVdiO zoAg58NiuzZo+6o;xY+czABOg?$D*Q^-n^NxQE#erU+C1#aN(NXUXQ4;Ej*HA@1M42 zX1xnuMQ>23a`mU{nEXLk>v8V_k=4=HYFY>1TZ()WZJPsC%A~!jt5wWM$jYth{Lh`c ze}D6+!I_M($>uE|rA4z=u%2Ro`*Q2i2!p44P2p5fBQud^U4&uH*SEfSo5UqP9HH}T z;d;Fngu(E62%;R+_pYZh{d)WTC)zmUUZgy>~SuxJWcwL9-u%zb`vR z@pP+U;s`MQf4gRM`%4{6?z%vUp!&9`DPuuQ&d>#|0c};4CpqcXeJ3Oa>hr-lDs? 
zXYbzUjqsl|z{((AJ(Jav*US8VG+Qm6ifWW4S5TAy;;HVI03}-gj6iB?NCb76+b)-X z6artu-auMZwAO}&?2@vY0=zBi{K%$M;y_e?mY^wZ(H-sBzDaDkx$Sn@wIYvyyIc&& z63edF#}B;=-{h;%`mT1SdTo_N zO^sNXD3CUOr|gA9$1x)TsFBqh=(++jp(DE{x%_Lvr}gvSHn%%GSJ6~h={2MRI%>?Q zValoU`S&#Jd@c<%C*WCv+07m(JyS|r?-hT^eY;Zd0E|^k7p`84xcZn+(fb!){pN+G z;;X)IU$C;C&B-Wcc`?egzCwC+Rn^dBSqrR`WKABkcG4szDm;;#6vfu1b4sTz#I6A6p(9|c+k3C*$}i@RIYoNLSe9v1w4XWCLpzrsEYxuSkd$nM_*OH5 z*=_RY_i?)Q9+M~W@8EN~1c?#9DtD5R{Y6}U^vP!n9y$1-r{2s+}>^iHlmk zMlR^$J(g9g^fDiZP*_Nw?g^^1#IdZZM3tydZ@yV=BZHuV7hDydC%i^0yw80#q(DXj z#`!01u8&_x<%o|Br|SCnG1#(`=aaYXuZT2%shu7qh&8ET&w>Ev;!?=crYus-omXS7 zJAe*EGCMi3{=E+K$lZ{?x4muI%7tBkBRzIi5H*>(^_#Sax~t|E7Lpq{dZ?+z+`a1j zzLHzmobULtGPMt7?3wM$z4xgF^{z|nV-S(@qUYM9nCN9B5X#Ena1H&5%7oY0LYPde ziru5pNdlo?>i*l$aEdN1vS67%qnF683}Pj%+^j)*46_jg=IvJhs2_z7#vz4&_9Yu$ z&na)9I|aN&Cq4ObTF|DJ8}&(GfxBsFwOm{u-_#7 zR8VI;)%599LXv%kg13S}&HLpU3HNS!P#vy_#G5;<-%RI;){Ro5pTmK}` z=sVX)wHUm+c%M9bW}tIgWPNO2MYm*911Sj!YY5%YIkH4PhdxX3rZ{<0t~yiL8>UnN zVtjkwI{R327R(;CiTA-y*Z@b0TLr$VN50QfM_^Xuuf+o9myu`3(jIGy!3<9vuj5le zYuJBX%ECoCnZ)7az%7~%QJ0a#OlOCo>gsCB^-gk@cU>|aH6^8!2Cs@uZEe3s6g^FGL7QbbjkHopG#&0f z2O4;UNuZM^+$7hS0*FfS8G>D)a43fV8PjU+>9i+-HpV>b}E>H zd1#Ybq^}V=7smhGD&)ejs1z)4-wHiSu|>fP`gQA$tF)w*&jsC|_x`b_IO}G=hGp>+ zV9J!!(5MY{zO3`0va)i|dhdNwqNbWy>_5yIF`PfW{`*#(4CFR2geG7V zxGrVjrAx2G=~R@cKu{qLFy1a9xb>VgD2CO(0JVH9g?uO${6_!6f_Afr-hmi0Oxj? 
zKntI-WL*4s=Ux!jD4@cNu=@1(Z@p*cYcqiQ-nJnnwCGMMLH|=+Ld2}HgXH^qM2-$M z%DY$^VNc0mknhPf*9Wpv$6qScu!i48oU|e{na`EVX_hVxS;g%7F`t!uKH~X>%AGlr zdzqW8x$7M*2UzO5b?mE8%DUE-#yeL;Q4mJq)5g5cT=vOBaO7z4xkjJKKY*jK!r{ym zZyo|2$TOi4dyeIn*)D9?g*~!(p~=zHV8PmfB$~KF5Vk9|aq@}msk47^b^~&xvu706 z{9#@GlUQRK0Yy)3mhtoRi!JfKhbWvPc!_2`$$hMCc^jwjbW2=*(8i z&2cAP6kN8eLBKZnguddk`YN;PphZrO)B*m4P9yVY0FgJ!EoIN=Z9w@Sz$ zo|2!dQ-NJr7+8jF>Ez($7G7wj?U!{a6!~c^Y>dU<`s-&!2TdQHqqd(A6cZEU;@UZU z2IsT=go-73_zw1kWfO}yJCKxJd0=snj>}(KG4ay^1O6^)U4*~>bp~1$N$T;VjnTO$ zlI1^~(9OZL`-DNdA$@vn$<3SVQ*3WOvrr)YpqU(h{0=EGZoxkJG+qL8t+7o z$z=RqzD4QSqC;wa)yr_R7m$IX5UfI4DXh^pjgS`b)+*RHZoU|r-08WQbY|NXf09Kh z2N@mTKVF$d31FG(9)J4u=^93+>nP}S;lyI4^QzeCaJxOCUus{lb)40@9*bseLQ?5> z<~=}D39;ow^I>F$Fy6~Oj?t8aH42g7oc4koq;$01)x{h&jFDlZ{QtUgPZ64PzP^Cs zP6GCZ!_fYzaEZ&qz=^hRcR{YVux@CHb>kCKLAQ9_{Ii6XTD!`SMZ&g!$VpOCau0*| zhs{)br1<$Ot#{}7?8yeO>e!-y2Jxmf-rpOponucW5-5ZHeJ!7I=p62L(dxOAj{5EN&YMTe1Rqc!1xbH+uiFsXk(LNw%$7?Bq%i0f~W{AEfZB_ z*s*GeoKcEQnp!@T=Y?FK8k#}Ep;gN=QKF?JqggL6917K(0Q{LsS!NJ%Dj{sY?j05HRe) zTXzYj&|E^jgpHZ~McH0;*0x2(CtlU0Mx18*>pT0olFULX$^0wnSd(@IUF_mkeB)W? 
zXG@u&s&Aw{1H8=O{t?JHI1XAICfl;ydy|p^k|5)Hi;z;9>PU$U$SqJ%8o;)^H1O)l zzir$9cuvF|ks@zYh<6X%Q?LeZo`}bS7A6bkp)LfG-3TIX-_u@M{Sidsw$YH0k!e3S zOrwYm3rifNR-(hSYw_lxf-Q zvHV}lllWPMUMHOGK{)-2i84ddl*?JTv%PC>Y*Ah~PlSkz`t0_~@Q;i7XNmHX2Z9#2UzKo2^mIw`+?e>s_r}iO9*KwwDVesA`mb@-afZ<6*Y$w)QwTP+`nncoopmX%AZ$MfR~GjaJwO_f!6#R+lRwU z|2i2^#X%#uS!_Az)M^+W?v;i#Af`uX9gAa^hPVcvy1Bmgkq+Xq6l=LQ4FEfJ zz-HH)FRx`dcE?#P#G)D2S3Cb&h$76GV|=d2ZMUUo zw3KsJ`%|Rew9hB&ngj<&B4pH-*~0fFY`l{r_E z*)!}n@vQGrqRQgu;(DyzZLgc1o$c|$FpY)ZFgLBvnoe6yp)%M=C-V^({DJ14qS4(l z1Q%^aJb%oOjxb30uP0ql1{7A9z#L8#JWePi+UXxg^g(@_FWfyFX$Isx8)I0uFEnE2 z*HfoB39a-{&{gXb{Y{%1dL2958}qeoene!}*4DN-U$%Lu9n`y`OEb-2h@0%TAmfJX zDMl}C?Az_pHA~$?_&{r}U}a30`$~z>l_S~@-Vz>5*?JrRJpv5qsc>8?URWY(#1i>YRQorU$i|re?!sD# z!Fa;=!8zEhVHg=8OcpgiZqA3{fp+Z|g|VUp%;nM*y0eUof7=EQZf@=aL@6JVHwihm zS}IH^3ltTC?w`xd6is3-mfVKs3|PTRwB)CN*tWOoj4wQ~5@GDAj9B^cS>^a2ZvoQ{ zh9NGP80zJ6Cg1k~E!NMqHy@*jk})m&XC80pJdgU^>7CL|<@v|5@Ch>f8ut!Ay&^My zq^sg;M$w=ji=2&3AGDORn78e`w1j4*=ELinWhvkN{g-jk0@A8wWO!Xymkt=N=H<(< z=TEW@G=y#@z5RHw!CJwaTi`&0Q1DOqz5-@8WpzPIG1wYD5UrfO)Re?XQsRg$T@|GA z5@_`n6-$gxoQKXA9l~Xw$iT|x$o3{25nXCvX0aOMKpbm_*yrlY3i=DpTBGP5l zT!%T4_~1O)Vaa|+5faEWft7nE33N!xDYJc0q;&FD##|9Zd<%_+Tx^xxjpGRDOSYf@$-A7#gV`Mem9rs z=^l*&Z5GHNfFU;cAn2?P--q|Ik9jC%VKo(tVH;<2DoV=8w5Qy1{oYix+ob#ReV58$ z9-S&LCjZ^B%r^nrsBJ8ypMP)M;x3kAuyTPbM{coM~ww^=^%A0%6{xF zluMvf*_dLb;!o4N5(IHHYRR(+tiCd6X#D9#b@eqVGBbb@+=tT=6B3v^O+}3I2MpT} zHzXE~?Wx&b`&^7pA3E;bl+<$Qt{$Kp-b+s(#OmYe`H!^XnuSA~Pj0q}^M(`Sh$8+C zN3(Z6AOSD4_~b%@Ue6mrCMx3goTNkoy)&&^&4gzTNZBw{zc zP9|#$`=`8{0tW#$Rv7jB9j8~K_w@dt(SDQAyaUTW0cH~>iY}xJv+K)Q8SPu0HWZXo zjtu9D(nxi-2GgbJrVB79xDfi?;9zYoRPII~#?|Z4|H7x`yq5N4uYn=gt+KaC@(X9g z?{q6pkqoS@Oys$|F;0ik6i6W*OcZeLu3IWnM0k}i;EY`aE;q&>NBY-Fn-*k0wMwz% zp?HndZCZO!9mPodK^7Q}>j;0|Ed)hyCQI_BUN0F&ZI0n zBekea-@L3@$Z}XP%w~0Yg6jO^9&q)-n;C-Gf4P?BR#OGighgMn@G2(vbY|Dc;M$?v zs@q$id)%viGgZ2gCe4@yis5f`dft>Y!g6 z=VMpbfd&UIVg2Q=2mpXsaZ}$s^z{obNs$lCA81t3NPCiPY-mow?)Cq5V zm#?URfuh-1a7HZGy1}=_DM>~~oS)aRtwM(^lWgZxDqzkoH(*)1>*Q)NvxP>(ubtxc 
zo?ipG)NHP*SfHOcI~^A?7ZLrecsL_1)cHS%{c+tHshZ8kEj;@3cM*R?wsE}Xfro$K z{@+i>X%8WU$8@Q8jOtRGU~c5_k$$Bb*p+FJ{SD-@nZ~;l=CA$yXxC{-OKceg$reB;gk7Xu`CH4 zu>p*%`}8|`oUi>mHObAtl0-01Mp(tX+9)&9WG1#Tzg50&e%DCxqtWAK4Y8(4j)M$` z{QUiJZ+MdGRq^;xipuX@@{gA?vlC{}bZhnkfExBAcA#>?pvg*U#pVzabOW*W-N1EK zm88HTgr`vM5H~-6FAO>N2zY-;qsr=qfR4QBP^Ih`zui3>tzfn&HJ`X(#zlF$-AcG& z5hna%7J0RPw@9L&WgUd)+W7BY>5Z`obq6bVuqOYoCMY zh8TX>%xpMXq?9!_4>K5>thHc%nuyFkjS9%$DTAx*^M@T%x@XaQb8v)S`y%H zSRL`b{U`V`dj~P}PCYk${oh)_O+ja`8yqkKR7g{sQyOCCvLG6xxN}8Ilgic##Vkj| z!yPC5X zTy-%pFigeg&K9pRAzGEaZ16JlJUD|qQ`AQiPv~v&c&t0Vfl2_J#)ZfSd2e<>0d2O z>B9GYqbJNbjaF|N{sdTtP1Mwrzapia%)F2~g2xo6c#NBmoV|V4Ro^%UT5}zd&vj(3 zo(d)j{nx$N>Cu{@953Ve_p)0!tmNWW)A+MW4nhyNX}KH9Fv4G-ze0d`#d7bcuL%|< zUN(F_{CS2anSK1{ey{`Hj{fsu*vzYk(L?Pi&*RNgMHAo47B!DNDQ9z7@iH0Yu;cg4 zDTE@Hi~xPA_qis$VW8z`MnXb@vsk&@1>LQ7$$A+v4Wed;4-Wx8XHbAK+B|%Yrf`(5 z2&zys2Jb4lAXV^J9DlF7-+^h-zjYdzA{VfqfI;`6aa#nX(Bwzm#>NX>>vzqbxOf)bm;#K;$mm;uZr?3q77&jU`7mrvPh+xip7!Jf-`_rLrcPZ_1ja%fTb z$Lkj3burtY;}UwS%fnasZ{Xo!Ha@uRS69%}vxA1%XZFeULxVrj3f-6ZP%v@-A}DE zCQu?q>gwvC^frc`Q&Krx&SqNnqAhftLazrN{ zYI%!w*BKa8zR?H_+}^SE4tDS|GHRAo$e~W8DUkhI-M{zf!oN+bbYfOTuyrQP2J7;} zgpFI^HMl_LAz?zF{hv)U8$9yXyY_EtczfA=XQ^r<0^XB`rfKZwYs`vZs7Od?3CuE{^D;^Ht8=C(32?&68KM zu}Rui+@W5$)C+mKYU_8XI80+4{V@y}{R6;|J47XTAd{ecCWdtf2nKi9(q#F8Z4n(P3>=Nmxt<9ULnvr>`FPf#S%j0y%*Qt!UT;Xgo4Sg(wo-MX@z`y3z* zXD23Lq$XifWcmd6!FS^EyZ`Zk%W}$az@r{<2gd52w^GOeZDs71g(Nmy1V?Z8+UbPz zys>WfAkzrk|9-g*CW@M=IJr=;2Zy}usPEd>@CD3(PRy*D{rSRTqvO-|!fZ+b+8r1pj+P9z}x-H+lV$Q`? 
zp^q~1&r;w2Jb326$5^i2hC6~HQo`!U z$v=)|*=iX)yC7tE2VS;xYn5NoZaa&NhiJv+>|2y6Eup1?C4eK0^b$$ID?5Ut7fszU z8bOnyrW5!+G#wrfw#BY1+RMIWi%!k<6V*x;RaJxLcF#jMT6J;Uc|vN&LKYfDM@QFL zIekL#*BkxU2KBpA2E9m^$rbA&5T{tMd*%-z&$oyq`Kng>g!Q3~W=!7}o3kaK_dYjn zM#*X-Qu}%O?a0{)k=v1jcdO9s2p80U{n=HO@^|_;scH&ygOCjoAx+)elZ}Li^KP#E z=!BMj$L+ne4(BjkTm~4TO{jOq(u&WQ>+2~K(QJPH{@s#&<6b%x-tnA}LA-iA_3p&% zQg0U-fM64(pLC#G?HxKUzhUnMV2O&UfS{N|mTlX2q3HO&_67^=E+RYYj6ojo6&k9Wj zn?1ApiN=r^&xO%*;8KixVIV4dUQO3=tn9f~+*;zbd4;Km5_2=a(fD$D=kqsqX%mX` zx4L@fiKptfyISkeQ_ip}=wA8Px?Ado5($OU&w)cz`S9tus+@oYSnG35l|-QtGq(Uf zGur`}690cH8f2wtyZGatf;OY;=U?nmNG~e_0kXBzn+EV=k;hMhJEb< zS*u|`*m&#_6tfZY_fww#H%~3E59L>s6CZ+2Fr2Pu>=4>2a}-0Y<_|cqK;qXJrCd*+ zBBbUHpQqh6J6GNa5Hifm(+P=2WH?H8OPzJf%6F}{qiyl<8>mbFe3?v_Jc~{ki!Dbm zi)SmwJs>auqeoz1AiS1~^7fRZC<4^YwQWn0txVdwU#@JQoQX)f)w*qA=>jV?i_m&d zOOh82X6$lrI)B>e#w^Il$L;0V?wkZ;@syJVTh4w6keE%Me=VV3x2X$E=U%If;e#{j z*1jl7rN?`2FVHnc5HT|C9H?F498?SrV~RrR)^#g0&K{ggm*}o<{`h^gL;rT6Ktr++a+AikZKqI)zJwJ^wQJp8D4{}`{ zc!`o>)5-`2`B;_Pa)CXCSJ74i`%8w$`&cj=S))+q?15MIm|*)wqRqGcPW3IW(&|5t z-d_>Ekp$CgBDOoEShE#4Zg8wsk=LCK1;=QDEKM2vf*n7ruL#~o8= z%ZqLj%CkSeKJ=KDq*elN3`F0~Zudmj-<$90K0@OJBkg?aiorcAokcG5AE?$NtOb2%9*0GAeR0ROyuEvAW@j1921kspI?!t zd(p|;vF~etHeM%a2FW*;ua$5e4T+vz-%bT(oic4_+{Oy}x?u0b$;I{O11yr4TfZyh zO0vh~ci;91_}PJ)3B$AAhldTrmX@P^*XUcb*`kobB7*^yttbO6@-1nOhmVWXH52@* zcGiZ_`J;m#D&5Mq!WW_LXR+tx`+$IeEf<=h#lR8v-yA!FHc6& zK;(r4cnohXNaX6o4rG5{FkDt=cyD0zU{lckxqC`4qg>k_C2AAY*ABi?5e&NjSc<5e zh^=JK;x;wh=n@VQ(T4DS-9pnYnXAKqt>*(Fckj$78Sf+H*r0H^?UrjV?z(<14sJUe zHn`Xe!Xb#@N_|I`DxLL+mZk7jXrp5I$#2{fQER(uhUXfw!Iy?%WO zHmd8_g1z(o-xESWS&zm(Zy5&?TT%bag>kLyj9&4DfMJ3}Hn7#}+|PX1?d^1{*52Np z3!#Rvap9kV!zr>1SkgEMaMxXOz{r#LnXZo6a(sBA0W-E-cJwdFf8lp_%N=x7asF6Z=w@?PoTHA}oDADMz?XGk6sPj;m`$Tm`Ck`3^ay6#+7jFnf-p_` z@4Gm$lQ4i;BWWL>e5)uydC>{2jZKs!NWSvg+WYwC?Jh%iY)%ebyti|2WbCAdAue0) z6-}C9@le5~+uCesoj77@4ui{gO(QS4@mcPbkC^> zG}z1la>0O5}iss5ufFFIA*y)7P({#%kI3;3B(tVk!GuB`{BGQ zti`ixgx>l;*c^Zc*+AJgH#avjijrxw1uEqUYlzl)s_9c3CRrN(5gV%+TKmhprKGOzj 
z%E~}g+9W#MZd34GS_u)Z_H2SIy~H^qw}GGvNlUjChqa|LuUz@@G;+caNMyTAp4Sup zk$$`4ut|VDJ-dk-_yHWAIeRI7b6VrW#W%!A=KhEsM${!~aQ z`Q*^Aq@kyS1| zl!H*ARmPRTAiq>RSl<;uyKpw6k|N!`KcCz=;yq-v^_%*(92t7{mQC-@0j1r1v#V2T zytB>qR&J^ckc@%1cBP@=Tq%F7PB%qD_Y4BqCI=^E#?Zxxl217d6TrLvIvmZum!^i9 z#D!xSPh3669tlr)X-UpX8GHPPH8qZwVRMgdEAb>xd7-0U5Np=2AUWT_NH57dU6>OwkiR9L;y6e7;N80LVjgW3Qp~ucd z$EV=6-^bNs-D#KKHC% zMRImSP((!XM`?6zwIzjwUWDuv)=Ko>MCtA3a3hG0m9>8^Asnz`5WbH7m<=q>m%@l-)80bt$}U5e%0t!a%W;p|__ zl|3haiTS3prIdcdbI{RZSP4!Dt2mu@JT^^C|FUvyp@XY>=JFaF|62fKz{VS01dlGN z9m|fA6K#8>>N&4mXfwcRAv&5i^@{WH0ny=h2PwKyYk)JJtq9A@^|1Kf^4g+m_DJsB z{9441w&4zV@9(9gW( z5^!U-N@2>CGq9=Z;czF?qO)-jJz`qY-q}F75Od0C$P_d(^n!OGDVZEY{u0!bW$dnp;<13e5Q@-oe(tzP-o-SRP>a!tO9bn&FF^ ze-UMVl@xG~qNE7S_>RPygiuZJD8ZZ9+rX@;z;mw2pl`d&c2G5mUx z;B{m!+%rEXYS*yMH%5p@;t`_PRh94>hw=WrBbWhbB!Inkbm5XGNc8pf6Q7W3sVRVw zI`u}_qXuJ#*8W((TA%Wno-{A`GF^jL6qe$d9+XQ|0f#NGRMz!hBpzhiPAZ+SSFxYy zN2qVO;xyc#huLhw*1Qgk#0ptP2+XQNO34?PoKEm8Llh6y{pAF2~2f zb`08U30Cs^9O-y7R7{>qaxtQBtY*2{Xq2r|(V=n30Tq?l9$N<)X@+|?2jbB-0pb8& z#&R_ey?%x14z!66_rSbiQLvid@86gzUcLh)SCwT;iV09U)%txv{f*v^)0w z;%+RuVaTTlWEuIp)pmkE%IcHoet^iXCv||J-jdBm20}`?f@n^zdmv`obbb2D6#=ac z)h8M0F+G$&^Vqvq2I0?M9fNpr-j(ixfG(~u`gWsc-Pu4RVTVv=m@7uU;2-hAQe+g84~Dj%ZTKu zIaL)c4UO2-**{*XOcLyf-zjsC}SvN-A6D*!mr6%%=Xxu0GT(&-BV`c5i)q zNMdo|pMUY_M$c)0`~w5uY&3Cfa{%o1VoPI>e3C_h=*DkWCkrpJ6X@Y7Ku%7t_;;Lq z|3ph&ItN4QlH3btyegmtVm&#fs+Pw9E&$Jou39hJHyLf2wmL2bDKsex;4r|<-TCoy zi93JffqvO#h{uBA`(S=|m#gAO+?=vjy63=G%#B5pq&xaeU}WoMjKKoKu1+%#1wQC6 zvF$oL7{Xaer`DB01rlBf#>|npk(;xd$&`l3PcXx!ZE_Nn;Gjt>3OLuRwg zAkKjw$vqh*O_O6EX-B?IJ>w^cZ-9&jK}+1xw(l2<2W{l{g0+ZINX{Z4ulllORcx&9 z^0gy^7%7y@UDtY zGupfzYMoCpt}maiI7Sppi;u6_^kJD7XtQYNl+_Os3fPPgQSt^GP*_kOj9}=sO7wP} zIl5X2G9s_=mW7k;4r*^kbhy(-I2NS)nU;N9er+Nw8LeVE4fKc|I{l-x-al zJ5prO!t21VO1S9uj$A?1{?1g1)^znLD%U#?T&6;HML_ZUTDlTOZbA?PQ|AZu2L1nL ziDJvLlOPBTV)UJgs?xYuk>7KsYdTZuCeZ#_H)Y=ZAhU~js2v1=;aa0-U~r@Qrj*CD zTY>MV!f@t|I(H3A_or`nIBKN-`J0}0s&;{mlT$%0&g-ycRTYY~%Z0uaZa1VyZQS24 
zIHvZpvLGlXCd_Tx&CMt+e&yu!=b9vS5UD3Ch!|D)BUq1MIyz4)*@{1C4@sXpf#y~{ znkf_!OSVc&8z%)793uTJA=#y9oeZC*Tpo0<+-2K>|~t zeA$90f@Uv0Gjlhod%|TZrvOhWzj_3h^7XE1r>(_~7kx~t#W}Tb-es)9$Fp^5_ChK- z9J7za)11W!!H?nEXfK%c0&q|3I0RY>Hu=T%1@(g{{4LaEcjR!dJ!pUFOMm5@sLzCx z5T&jnsNIs1%^}7@U}d3LN~Q(Vb@}q;M6s&-oj$1O1$sko*+d*~e=)qr`|rC&Dvf>( za*WBZ555iUM8dsSY2{V)Q8e)Z}jes2$ge3laJXe>L?pAGEb{*+}iBM|}0c*!)iv;eE9F*0PK zS1TEW*u3Tk$LfRY7#T=NNTg6nbxV+--uwVW#>`_IQ1}dxEvBaC{{F8p5*I|)#gKq* zK%Q-MvOA8zmix)AGU6{*3O`Vqgu_tc)att*m#5M9jJgYc|>dHOV2s4{8$KYp;<} z6o^HBvn9r{xC|XBd2o-EctxHw)D?o;1!&)ad6hT_qiFbdhC0R3xF{%~AxW&}Pa8?Z ztgclrvB=KDQ8oz$=kc%A|EW@4kFJGv@P{v=&f#GVJcM5S3W62sx({^tA~_D|#xxW& zV~%X)<=tz-Q2?=mlFJVJE=A~NkioyZM0#o&nV&%%n1y0rv@uBI!9!o z8m(kqV#!yXW*Ko{#!6YkuK5nRB8uB>~fQyyHsB=UzCm}%7ld3C9Kc3J(}N56Iy z5YkNK*^n)?u;x^godvD}2*0^CwvG5*JNPz~y?t3>G0ZD2u7%azZF1?-C2MPIdmmNZ z?ofm92EI^wKFuWs%LRYbKi?}{Fv9{XW;l|Jg#kHHxq8OWXVFb+7?A-u-{}11D{RXw zt;#_Ca>p3S{{G}|Tv{W>9#g|EPaO2m0NKB>n!uON*(fK00 zJ|$WKCpr#upzmkSSkC+83^HH5azz6EG9@<{7|geHtfV49&=i z`yV$gi3o3cNxAx~v}?cr3HD#_=GVDhCcUH}s0r_gYj0pva>*RR-_#u9zUUoMP5nji zmn*wW2HFe!qn)7u+2L+e8}|_6o-h5@{UI?i!dhXKr`<+USG!g*f?HaPgI_tlVtVco z8Kua9kiAc#78LU%X)%zdYrZXtn)31S(P8lBCw+KPoW7@fV5N<5L|{I;kaub685jt? 
z*#=wt^e4LizW%r)SIjR6VxLbva)y-~Ec6A3J0>Go3}{ilS1 z@>qeWGcszWPQ+4WLhu^QC@=BXU(pS0pYL_dXg+X&M<$S6X1j#@#) z2+pkIgYC=&LQ%66qD1FYfq%0uOhL$&kx9eNw@4{vJ&KYB{c- z{UCP5AkSgSzwgEGb{YN%jbDpPet!05lVH_cvl;|)-hJN|i~@L-^`8!*sS_9%492pEyU41(&Qm1GAB~$bf3Fa!^!Mf73Cr z8vMAUlbw@T9qKFY_^R(j?oSQFA1+z+t^Ay zi9Sujw_~6PhgXcmr;C3-iJ!q`2>by{69Ms;#-GJJModxD3DlWyKWu-dvn%Fp$eLy5 z)GJo}Hht3ICK*&I+QjC4)gi9RMrVV{fO$E@gMt7iZt-%g?uHjWlc^O_${*3jhtSas zwqp-j-y~&=X;_5#>01$UI&cCAe#5`s3<22Fv9aOBx)k{2qFPq&OR2VGKgD9C$$W;` z-+l!9?^f`q{J;P9=k+pV)pkar0fB+J%?_1(=wV#8#sb~p@%`zl#5jY{HOsgkdXDDv z(arbSoeDhnNu7wImfy<5&Fz#mBNqF5#-47vW)pr=d5n5wf-Ba!?@jtqs~G~?$|58Z zB|G-++0*BIy;bj}p8vYPLp+fet^5gYVL+q9m-gbfma>rSHK*uooIle3|3Jk{-WC18 z{-L(!iY{w~MpC5Ay_mZ)ft_+gPe0jO`UDG~E|xTVp^Y`))xy>V6q{M`j(7~kD6 z@vZbA!?L@j(=O$HyMDcW`V%TrqKSJ&gXz%TEY?!mP!b-o9BM)9zIGI&kqFTbVR?Xa z4%|}upv$tb>;m(&F79srdzu1w-9(i&T2aJrm!%RJMM|CgzjE|}LtgRm z+yw>#{FI=DCd>E}=Ofe^oLR|eF+bbj{3?TcHGH?#IzoQ62u~=HL_|aYJU2rd^Gh&` zU{mx)os=QS8_~JAozk3^mIkt@{#0Y>v_qY$MDn)9OoU%|?7`fWfAV&#JVE%9Df^?< zRisL+jU8M=l#6bVj7AQO&f%ZmKM-0}0zDx{axc|DegLoh?lrfgqN1Sm73So;_0Ni( zjObHv1Vu+j2L-XqI-Auhc9yI=* z`Q92D8iLG?0NUL61{3RcAEenWCo}V$mvQVuc!t{2^Y-fZIRRKIwSkiKE$>Y=1CmD3^E+WnHIWu6EF9}SgdO5ufLY?DWL zhip&NO{akv6cbqW3^DEg3qA#GByXt!sPjdV(>vULsKT27{vuLv9East$-NxQ7dKLl z<&@HglNEu6v9=pL!Jcx5{rlUs*psT8ZlaAWb~Na6bbzYZ=c~daIH!r>S7UF2H~Nk@ z>leE?AaoC=^N8pMF~*;nz*Dn0vlm`mL8o^~%0GKEGf_G-fo-2YQvbn+V)^*UTf2Y? 
zP^cyWZuS|bS3ybb3Nxd7!)SM~VD?`_jDWD7Ll;?c1N^Ys(f7wecLH2sWM(PpgYPea zw{}M-^${hT8#}M!aa2!7;Xr_geu$7{c>AY=mY*r9QxAqOc+l4M!Ld^~h7aUql+TRS zcuWJ2+OuoJCt^@^NrScipmH}^vZ}hwSQja+R2-fhki8Jb_*~%wnVn!gX{YLvJ9qm= zvF8lKNohZuj85%60xSLSf;~&gB%#Qxd_}oCE0Eexrqk%U_I*cXUr8;-IWjXO1z8Vv@D`iqnYudI_bF9xwW-Vq1oEZwjoN}N zjjYS%s+;b;78~YpwqlI!HaxC#vhLd+W=G>A03dtc9ZQ|+mPj|E$;NMV6uagb$>g-8 zyhrGVh2v;_$_wwwoA*vl9mfxl!{;<7wx~|ufq{}CuClAe6l&V^T!W`?^Xz2d^p=|L zVsy5yDq3GU*iQd$$n|zT3mpqk{q4Qe@Jnq!wT?7gGc%HTRLL3|cv5SZj9mq_7`cVZ zi2|vOjGzaMiic|gz>Q%c2vT7mQnbSvyH-1{yQ*%{r(XJ58kb~J5Yz9%R;N4R zL4!cni?lj2<G$_1fmc^h z+ZR^uuKg%m*u8d7A&0Dwn2Pg3|JG8NX%x1~gm~|IJfG! zKMX+*0+SrnJYEoQ2o=Pt9odx=m;#>W#ur@vsH^DtV?*Fh*`H;8+^ooB}e3Cc0%{w?y7{7eAh4Yg)YNOnbHP$JPT!ExiOSxV9R%6y!b(GY@ z4E(QCkW^8EO(mnknL^%)3T0MpA?IB+mg!LG#z>xe*3|q`t1&An{%8tmr42M$(Po#f zn>_z0aUp75aoCI4z0O|`He~(KE`BxB({n^e*~IUVl}6STbH9|MFC|(}_~5cm#p9~q zPX6RA@lFj6Udk2pSbWB-LD)nf#h3E{^C4DX&e3E3MEPdl!Vc4XYUJiHYtKpY6pMm7* z2#xV;oopnZQom9p{s=!zo$TH3LC)=<`qY?-yCDUl*Jr+*l_9UhrH$$jhe#}~sCfw) zSC{w|w+~8p17j-88;;k-Dx|iloZKO~Uvu~Lt`xQC+4NEHKVy z+vwV7*fybcC$IBXfBUyRtCsgg+BDux&nV=(T<}8nKnlHckY1?qsv&imyQB4aEt6V& zrEA-MUbsJY&uE&W+1kkB>YAy%2q{x;<|! zBWjKtSI7fLc&u;fzbG&NneHjY$)0gLD9;eY?-zEIm1T1&1fR;1f&h-o-ziiJjtOK? 
zg^s#cxe9kHJ$NJ~czji+XHnhrpBmgE8A6?j)+=Vy2(R}X?CSd~4i}wOk0}na+OZu| zMlfDrg7)evT3$Q01S9rw99rfqYsyE>4_@uP`=leDkKu4z=gKEn206$E}lGU#saJzGWjM&hgMv zXlz<9N3$mCp4P_Rj0_$%6fjyFr!zdMv4xgXY53*}a8C_O=3ciesl2yoxn;{t;;JCa zQ`y;HPOFbp*OzR{dij5py>(QS?Hetuw6p>$ASK-*B|VCCcZW1cNK1nSf(QZmuoo;&vVC>``SCMRD*HBtMHRJhB@rKG)7ZnWaRi| zs9a3J#rLrOlzrr|m>n3!)=4ImOUAfgx$$bdjVI!4PE2B;A~Mu4&8@)eU3h+I%va08 zLF6^H+|%W68d8pO==4G`%QV$-J=EBjv&|#^1ba?j!@FQMiPsFDuO3WA40u9$a_-ml z>v<|f&qh?mthbym!J^fkwpyPry?>wY!gMAlw+vX}OI6?By8igF4+X{bItm|(I>kdd zjT#hGR3ua>$q;@PrT{j%x9xcM2_H%yC(9P$5X??=Z6}CZ$4QjDUoNS#dfJO=$9m3u z@?GSJbp@&iEBx}=gN<(iI`y!}^n80oO2t{nPav*P=ongPS~kqS;>B)RHr!;Z?H#uzo6HCar-n)Kg|KbL%wJm>@(S_QzL_gf6u*nI<5drPH3q@SdyHAvNjnd*9&~_{sM@YNlDT+f$Wh(acwrKuL|=dd(=o<-6smW}BZV~D*$ zz2n6qB6l}<`Mp?BI2uOZn62huoO8tW(WTg;BvbU3!i~!xPS;OU4G$-#Z2Fvii?2Au z^Gt;-`}8U8usC}+9!j$7Ouk`?WqS-YG@w6!R;*q2zzAl!Mk}=)U^y7_l*o}u;&Z-+ zl**?k$55D6nrg3}*Inuy#sqhlYLc|MFld34s^Y8Dfo2;C7-K=qxW!vy&Q0HK4oKgdCu5Pq}J@Ev1?Gj-u2$GQ(TUV z6aBjJlj+#a(us%o;lizPmycX5ORq?xraH@aEFCP~>FjGqkQK&coUT3ToLb%0UQRWx zu$T1`jE)J3hH)FY`|~X*lj@c99SO~h^Q66}FLxr6lWp~K__)2K;nbBe>+?tz1c&HO zt}KC6X_eq716#05wvwpm<#JD!9GD&ZU@(OMS56XjPC~Pip!B-NAL;mc8i#Nj1*>aodAiC;T5|7EA_{?HL_N zSrbriX(CVrPo9u;oP`_9wMvNd%yoq>OgfvGrk$>DmHEQC0Jz9LEVQrkJ2T#&V940i z-}MHwnyhwm`-UAv#lA!#E-cnrkj1f5!7r02!lY6qW&hT|HFze zOs7H%4vw<`X^c4SXcjvXBe4{!nAUSXMkz)NdwX_D!z9Lq z*UyLY5<08HE}J=!F=UpQYTYbrN85LHnv$-H26oz(PuhPyx{1%58xK2o2gxVEl;5V! 
zgaV)-3JS{GNfDodn*bVgD>M;I(D)xlYh(f}!I?KDXshOyJWG#>xB5jJ@5B=cbX2fB&L1Tklb zg?(XZ#0&xgEg#>J&Y(q=l3u@R!+?rs#1|#lHL|!_7&A$3dSm5+edR>9iHyV;)NT%; z@jkRaLsuXphbT67Tk5DGm4;^fvCAXgc{Rn@4KqVZXj`XZj#n!qA79&sqG{+mGi6M@ ziZ41oOeGM6=het`7;g)DbO`G;?^E`G9%Kj!PKb5v$N`(qBZzBZ|fHpyx|M_V;YJv~PIo$*OkTl+RF^fRaxB4QHW zExQ2Zmu`17b@3l?JP59!4e6aCMxJ_Z_>2OYl^7^fH)o&cuV-_lLm(|PP@Yxc>+28^ zEek~h1WEi0TT9qoM$3Oi%vmS2PNRKABI!b%-fI((@;@sc0Echcz} zHYE2iA02DkxTQyx&Yo*wZQZM5ah|8Etd(am@D+yDDpL)OsMnc^BaMTd^(m=Y)#hRm z-(HF%-P)K>N(-`m!O13pR&D(ACyOknakC53FHnxg>q&g!K`6izZwUSCaJf0Lja`Z4zNS@L{8CT9Pz4t-|k z@=&~SFASy?Z_<$%nG>T4^R6g!yCd9~RG_Kntop5pUUaU8lQ^BH-NfoRIW417nY&b$ zk+|5TB=N8g3G*m)&bwId6nY3 z53OTWb}W@Y(I3lyX!SUYJLQf^d?HdHa^kto{K`Meg=URJXQy4Q1Jkm*B(YEvMo0}E z8h)Ciks`NOYtWoN;?lEbpsbrA$$SLa4iI2C;<6r~WTvL4XJesY;iWlF;Pr}rZ;#(V z;eRxc;dX@5JF9Zg*y#7C(gw@~#1shtW!V6|J^(Gyz&6^xoxYn_Xc;ICmm47ZT%G0W zs_f*r_oCpN^2+PvQwFN5Ii3kTacOZ+-2Dfuvh^|1&$#oZ%58aef4*))#_8+n*#cl$ zR{**Z;O{@al`=hArfGT6Z<&|%FTik1vY;kA7?)5mW~0yC-AHb(dPVB7sbl%f`YGfV zu2iraH!%w}1{Nk3)(YoV-qVCbtI0bCKU^PH5(z7$|Z8Kn7}YMDiVN3NFx9p~AR(8~ri_inl?qWeXymgpzn3G**wP*-Ta4wOi9Rc(omxD0qgF zqh8ZLuH3P9q$$_Tg_z^@1{2;0z1WM1+hqU~15hCV$#shGZfR+4FkZu3y#ueiwyQ^C zqEruGo+9FM)4I8R{5HiNo94y}*&glY$2a?>BfNuwme~dVX>z%`GT+;^Oj+3n3~2aC z^ne09Mp;kgmxxIG0V{3BZ$5E3)tVbsOz~VS6!8lP1|3^j0zwS<392M|=FUF(H)zH0 zpo@Za)*(N&e%ig1wIHi4=(zhtT3Oq3f|HZ;0yS{~L63k{t385-%tn+zOziAdi=XGK z<((B69IaZ7*+t{AJ|gQW-+^w_&cpRO@TJIS>t3^S+hCgwEN4_=VKNEuuUZTg+&1lL z#;(TCdxCqC8^<7#xQpxrR9Jz&`5KT<@lMSguq4;jThl8|(Ss=Mpj*UrDM75XFL8ws z9OU&k+BHNDt1~077v$EZWF+tr5cSA}Hj8a@u^Uy!rL|WbVTjL42p50m2?+nwyM^!} ztxrK=U^?CZU~hg%$JXfaimZ%gi4E1O)7D@(Bu|8(%ti2K$Y!J0EPL_H)A1Nt*=izRB49s|LG5Su`_tfNvG`^L(;&U$&EmYB zsvOBZNm#7r=F|xCAQw}AGv0e?B!|b4RqjUx2mJ{;wD;GATJ8A_3hMx@ygpeR$>lbA zp)&GA0IKPX3&OzeY-zoPh3xC=RqgK5bi5abgSl&G$1sqZaqrU23*327Tln>z}Bnse`4v2j|v$z_iN4R|jUXM=)54yqY zVx-k|JxPE;rK@^qu=m@$h$t};!x5DB#KFv^xe=52B{{!yioZS1_&9_qKpF&K-@U!P z%Dy(bb=fNzYrOlG)RV-SuAkxj^K|srw@9aSqH)fue_qUqS;Xw`@6V8}Tufu_-QuC9 
zyFf((asi$$bgR|uihZl01XHQ0FM?nN0Cp@^M89j0a?1TQOZjzlTA_o5PiR(Bm>hVD zWAYc>X zLhT}DGh;qieYo@DAs1aWS8JY031L{A0P{acBX5W@lnN5678d>G_H1gov$6V{OapeTS4mu$U+oq#_A~sV*tLLqhhAK!YYEodokJWxH>g+Xxzbtf%WX4M8uleg^D4-sQ-*xC zoZhgSU)KMsWe3>(!R*Vy_BO+=C>Coi7b2Dy!~y4@fb{GZuGmUvB%82fyUKIrb(lpE zi^piX?tOSir&UzH&tE6&wpF>;L4t+7LBKQ!aFusj=wl&@iXoBsXP^U`SvrK7%{4th zx>muZR{a!6HH2l?Ci(WrvS3qd(H2v9dt;*kEt5Q*Dv!PI%mza~%P(q<=A%LAm_8q> z#P8$hhX(F0P+b5DEW1Arrkb=Xb`MuaR(E%21{!s)vRe5Wgau(?y;v?8-8F!A1uz&^ z7i`-WbR%OSXG$i4M@(lk<%V%VOWJu7O}dCpNL-d`1v&C?+FyLR+LlhbZ~jLWDgl7( z*4o&34~!H+w;G8XvLuDq4WfPC@uOAzB|`J|uEDs+V`l_r<-sU7IvTB7)3c=G_js82 zM}JPBHd6gcbwsBI>&Hj(MU#L&9gNR$nsq}y$rYp{0CZ<*B>vlX?-;Cicb$m23iQ$@ za|r(a@G5e>$!lYKZkDJ*K>?Za*=kU_1S+0dpvlx)qsBWp1BM2hJ`nDJ37v7+L}Cktag02DnH*1LaQAixUU@d5Qwz-F-d zQ+RId6Do9MVN4_L5LuZqLbr$R>XfFV9}eYAKG$z5>fgLQq|q zFZDFkz&JyPDxK|7Q7P^1cl-%B6Wl8LeZzLY_6;fJO~`aLw|{ChUYKTTK)O0w@e~Lc zo+NjO+;x}R!IU-7-BY?Kji_XbM`_}J>wpB1-qyKjg2p4`Bm3mkq;292kcd zjlMCi#aw{kU;93sy?c&P-@>FfSw&|nXm6r>Q|zX~cBkb;1t$YTCMch5OB*;MDJm+y z!z8>f`$it$ddvNXjj6|Yg%X~K;FabtE}iprj5hR+F;4vG2g0v68lN^uoV7GJt3y3K zfM(UB5lAZ)U5NML@)_vY-(k@XUf$m8FAEB*IZaxGEAr*GUh7`yEL)gRz9=IhtqAeN=hNg z&-|UGm;rN7U0s3%-r7lS6FKcd3#pj7ID8E+e<6er4N#imkn$e_0vP|2$z!w4HcD5< zhZ0vE6>k)DrZ2@NJvPtrNmfRSR{$zzb!`n?iygc&r?oGU_%Q-*s=5%%5_i+@&pl8b z!%c5p*>i6b^waYnAu1RpbjMu!wb$6_ADS3feHRHdk~NwCOu}bzuuR>msh0lW*>vq> z$CnISG7`f%& z!mp!1m3{oALai zaA9K(jp@~`03o?!_SH^A7g{jl0WkS<3k$SeRGmto{fTIxh)+mJ4<(YHl9Tq&RL-O7 zqJl?7GPfU9Z0vT0&Wi~6e{wDWvshVbc}~vG5vI59Fa>`K#w~fSrm5Wjvn?WGb%M`! 
zQy)jH>6OjTEA%6pS*o`%#9Jwp2IZcXPFPbRn285$`atjD0R_)V93%aE`g;cQ_=2sM zTN2NPP+7`!{Hy5z-}gFz+UdC~CW8(_6{RIR-MHVas~hrrcYy^q;L!oIoQ8&mbDYlW zee=IsC;?N)Y*an`A{`+}yl{I#76bZ3D+O)fy=fdl`x_M#*9^5@&^Y$g}a zxW_S5FoQCFU-*I*J(;#ODgp<9-(7c7D}T!l(Ch$YW|--%1=`?STMsOUINfLzzBaPgGUbW$%sCRlG1gg8U>e4hCA` zW8AE*Sy%E)**Q6LQAEUr9S}n@ERsu!Nca|me)9-e_}yKHBQ&up*miK~ajU)G)z?8P zkeQ)1u#7TZX>6*f9*_Lwf{Xh*IzmWzCI!dP|wO2 zXT2ob*dT*(0f2o_IRWpOg-$hDOSK`7mN>+5 zlbaW!{FiphTlNN!wW$OaDJn|N;~Br_Xryelx_7z~&X>C1j%G2fcK&mv*SSTk`)Wx^ z3BZ+_0pcY7P<1pvx!{w%72321Y;3WIZK%v++#!GODi?aT_BWSK9H}`05PIASz3=7a zio4-PCYn4c5g$ z^@92VL}Ajhvf-j;7ACwd^TRYb%aS~B;}t)Efa**dcX-~lP4S42FTmp+;A`HN76^g? zfI+TK9B)DGWY7MJo)CKaM%uzdxE4yx&Efo?wMAQnb3XtG09Y>Tk*!D1v|}O)AN)G5=!l86@Z-YKkeY6G61BwoX}0 zf8TSBK)v+Pcg{;T`~V7&^5BfKad3p0T4=U&AkoLzU(9SC$xVH|Pv)1Gw*g>M0{r~5QLe;sypEFYPG13k#f?XmJhD+| z=A1${^uF+8h?RirHIA};Gv3PnXL- zv)wP|7wa^=R?ojda@^Uwei=&Oz#0a2lPvIF{e!YX-^vv4W_T%?=4gkhWIX0;@{}GC| zyCM~t=bmO@qZc$>0LAXlhlS%8yxHB+b~p6 zkPA`~ zStDrn`|zqx2yZ0||Y)^~^Jj2@3N5_#NxG=}N0!5w__ z-`rdS0eBHlP^+2sa6Q-=s@%0=?4p4C`WXE5`2{Nr6TA7ifZj!*!wa-1`v62rg69|y z7zSs+X1u-i5oRr&^)Yac=Z~KaMc_pA=<8!=iLIsyvy}-zV zeUsbVV9rTT53|L!*Eu^V#>*xxfI6=(A?L_QWkPgPe$;!PM(jRvF!W@|C)V`O?m|Gf z0ubK;wx()8qHIonrne;MI?njlLpkV9H?hKoD@^h@q-|_AE~e{7MpI+gC;}zz|8ES`?4CuZl%pyZnE7T?9j@XB*VoF=G#4r~(LI zh4qI*?yXZz8FrUQUD%nHB-6-C?=OzOB_JFXYe_(|FFR~OV-h0Yw)LOG{QjyKrTvK@24KM~Kad z_;gDM%~q2VE=UyMePFZdS}^^_Z9Pth^<#8!IVo@IpMt9p0-48N>Zklq9Frj9VFfyB zlFRN7j9bG1M}YVbhaHF8TC9H)ti|`XABZ15 z{ZQmI*BXj~Gk8S74(DENmLsQUV&bKvgJ$u5z0mIh6n<(%`jFRL!eTPve@nkM2+x8Z zT8kN(&8K78?b@lOn+RgM0xlQ$n|QO_c=Pt{(a{k#HMPX+GcdyZQE?&M5oqGI2?Z>qWj?h*>6!jM(JTxmz!|y`WgP+y`={`=qA0x7mNYhvl8?fp>=?y0c7$FSRIvL7Mg>6 z#U=iDC%NklzEprZ!VwUH2+I2ryt(O%Ef5i!g!9@M4l0UIg)IMSVmo#y3w4iv{9|5+ z?|?9u2EPHm@ukS54JH#VkqPF8M$ta00ymE@X7{enzc&k-G+;>;{J<@A*T~$M;_qb_ zz<=T7=>F^V<87Y+5y$Q1^@di79$z^uy@SR!cq;}%;={nn0Od`ER?UJbSW3BN3no?E z)Q0-SRaI3Dt{6QnOZH~DckchZ0xn(Z#XJTpbmz0e*{sALuWaw5HtDIrvUvfU z46VRtq!p3F(T)&pg$s+u-$n43=mBH>>K`PF7Q>rNyNuIdFzz|ceya+&UbQhLuJcY~ 
z45L~j;A^m(Z7wj+uhJ}crxjp4nJUE}8|C|~MrL;BsyM@;O1}H}c_&{n$3yXFVSb za#1odNfW8T)GE|6Xa1U=Yd+WlnkZZ^x6emG7Pq+Rg8~ji{!s3A_f>9l&4(X(CM3jD zv$S(;3I?b{zknG^pfk6_l!_R&)iMGMpMyU6A5lO_Ve;diGZ(14Vx>tjz514-+z_)0Au+dKuFI`@;x`e>fXk1(b*6Q8QH<8@pG}!X zFESf?0QzOg0PF7$7y`v5Vr~@l_RcsQ#MJZdMBcrc`XWf*zWX24qxIQ=_o3 zyPzZR+b(&_qGMVQ$r7mnX2*{*{&v1jjQ0hilQ5H;q~WK} zhd#6Dp<9HPxWb|!+#{A0_wO43T3h2mhYYSTK6=dxho$7V7{pa4)<%|=8C+s>SHBHp z_wP_prT~dn37~9sY7Pdo2Nrwk+pF$R9ZW2EO$lesg=wzNE&t7LC5~oO9+6>!z}5hz z=oF(7RB%4r>^1*U%K;T&RY0+U)jl$i@0dpOH@3xq2wZobIak@lxThkNQ~ zx!RtV8WrEh-T|PSN<&2@F2RfiDA0laDUD-|Tau{apVNzm&xGr>V^W7)v}gg&_@b4l ziR@G9@Y{~jxmVFNga^GpvoA43jyVGnl*>YA3UG!^|Fydkj&{yY~do4|OAL;({JcA&}zIiiO98VKZ;d^@3oz9K#`F=HS=4#eBPM)_xB z=+wz6ICjgwetp$u);4eD0rc=tb+^bUZTU#a|Xlx+zG2%-=- zjwv$<8eI9-rvTqHg|dzM_&+YA z;U*de2A{`qk%MU=*WN$p0fe0ymh<227i9DAxBd(qE+n0zEksZNStfPaz&;XNR}lB%R0TfD%_6KIYN zxBau3o(P|_2KTeIg)1^6mBo*YalduMV+NbUi%1uaisa64sZE^wn?TLMPqgrAA<#TR zA`%j?mv3i8FH=BdT?2{cWL9I&a_c`|ejOoDOE8KDOhm>2em(E+nzr9<;#4DWTP~W8ijXnA=xCBuztxn39a zr!xN&@i-_=1A?O7TImjHVh)VpKk1A&qz+*(E|v$VAC+CKqsruwKMVH2X%%OA=`I#L zN|B8B>>J%azsKt!9^2peh@M}YxKbt#UJ=DZQcOxLP*mbfpzZ2KJ^H{B@A;g)5WP#0 z_~7audF$Rl;xMBm%R42ebMZ890tU+J+4Apl)>T$?fDED~8mx!FXX4}iIY@8OhetKk zfY%@h?P5%pwz^TjVw9YV!j(?{rx)f8NkGsQT+dyXRb)G=deYjTOU`ruNhPne4gB$5 zBiY$07P+(B1bT~m%PQ}SOk!^TYXlhNvt*@QqPCBJpTIQ8e@>vtZNM*7?S53?5bZUw z{Lhc2QAG&8Y$wRPmwvU z_qIP=vfa`!H~%ui{U?dlqzFD<{G__e4wa4NbC&hY(Cj$grut*(<u2PW0-(q)AiqYRmlf1T(yYlcH|ZC|0$ZH;WNnRq?T4)bS&p2tlkrZ-k(ZCO<_)g z8!~eFj|eJpu|jkvcjkuk4LD+M@0Hzkw#9(E2l5-qA|mgTC;_SWwme@DXK>e?wm_!3 z#+Z~aB|NP|>c6?Ug%#%C(AYS6;UvN}^tDZxdvM|3ap8y-lP}oZQfx;?HTcSTCI3cv z7g*jrru|F+cY!t&{l*E(^TeEI7l( zPYbLTB31?+a#XTXL7VE-Vx%5i6k=0Ncu5}IXVA*4uQW=|cR4DPV*7I<;#0kS5_}>> z#<8%vm|5d=(luVw;A7ml@u{LNSv-*(zB+rztM@GHPuEtoMm34TrJ7pP z2J8qK0cqPq;g^{VQD72{9^NPw{ z9Ue#k(roDcqtzz??}En$cHbc7tgw^7E00(=0RVN=&Mt&EDKInYJKCVz1bnkXZeX>08El#48Xb0>=8& zy^`LADesOXB*wgR_+WkYINnY-et&?D=B_4NFyK@MRB61tkLrEj3~OleYHCTbPWJhq 
z_raU<-MH6jwX`y5!Q(6Lj!b^CXH8l7lSGQ#1~47^Z2@WrL~xnWLM;Bh%u+sJ_U))_ z?Vjb|MEPQ&`A%pH{Xkb`cvzjUlJ@y(KPcu0u+cM!xqrFbGDrl{ELl!-mt=1^TPz?c z0HWH#u9Zt|tXXa^|5gmZSNR;c?zsYprDXBxbLg;KhZaX>q``et!=`7`M0@Uwn3wCN zNfR7f$A!r3X5;|Ro_{~)IRUQJwK?2DuJC_s6u1xreT8T_VxkI}V2jP7h-l$Ic<>-tudB5+gqBlCRoWjjj`ZXFK^$D#+gtPoxT9vuUi}-=C6;XQGfJ#` zj0y0kAP;7g|EP3-@Yz(A_*M7 zz}gonE}xu;UVn3UQfls=Fy#r;27cD!ED?M+<4X|$vkSf(6=|bUE+0PBUG2>z?}Hat zXRO=FC+*`z|9)Nf|4~WtGPm)uyZOT2{yCWbbrKcTbSPT9mPQx`f_nw{fpzwAgPpyX zXs;m+vCubDu2%hFw{eREZqbqaDL|}e5dYUp!c%j2knuOWs#QWbgPnvPA8xJft;AJ8 z{sgbxv_2ql^a4W}<&<3PrPB~Z57)yFS4_GwxAdcZWCdhV{wY@e{m%dUzd$mp@aZRj zHV1s;dia5xorA*?@QD)Q_*$VDW6Ik9{~z=J{*@Q6ILr9WtEqrs2ij6?X<`!o<;xeF zp5AhOb>0C*Fy;KmWdFbY%PbD8zS$IeQq%wZTu5yuvX(4KH92*_ox9)H_B4MZqF)-ABk|^A69*NV!bA7OeVEg|&J<{Wn!|yd~1@(~9NRx5Qn;;lQzjyeq0a zBSi&Si*LoB#XxaZGo>OTpt6sBFq^hiE_(AM$&c-cdpA5RT0_{XT>;xQC!y((UJGy80gIg?>tShODHNg-!rkuG*`s3_N^l2b;Kc zzn=xlXDZvb5JaWoQPC_bFk7S_QO;}IQ1$&Df-N?u!wgO<4fpXiA z@9SbB4b;@eM#fQu#C)T}FDuKCUb>a;9vnD2do`+-{U(uz*dv+9kPw_Lp0n>a)pj?7 zz^Xm_)!WZGfjLP@g}%{ZpIcR)kMV5AJo#C~!^0CQw46l8Z8SRs;%5b;d78q)LMY*w znyvX_dJ7n~0PvO+hHsYZY7|4Ak~00~<246T8yAt?t~{vvxE^*B{vjm!eVO9<5K65l z`8=4?!x{}2^!w69V)g|zxvVwB-0QR@a~2`RLoDyKd7X6xgGcwCH;4KkT6D`a@HW?? 
zc=|u2l9GV9=L&di$K9}fuXr{S`kbET_vP)@y`KN3VwXnbW9NRL)*yi!0+?4Y#WT4M zrHbg(g*u;PO3OIEwfypPM3seJ0+%*rr2S?iVRt+F)lG&X%= zD;54OOLFAgS1_#KS0qynYVZR#ntWZ~jeZAZW&x1ywXw&g)}Iw^CP?mSq9W~D^d{>L zWTy1Lu~v4QIo>;*H9At)1>%&k%JsB)dUE0tpG>jM8ph~mNxSgHXsMXStKBAglGxtM z31Wv>eE1Mh>AErbu`dm(S6T8dZ6Vlc&6WGPiGdLn!DX=7t?2XoPLL2927O9rrK_@e zceec)3#(d&EoU@6a3T zj48`*5lFWAeUXXi$aSwl4bMEcl&ZtK`0#Ww(!0(fSniSzOYb4FPMxVLl>i*|kWFw& z!rDhcRdo;m_ZxzS6?S^mY+k$)^U>74@D{sj)1D5yt*=oV&?Zd~7du!u zJ&Z+h^vflWCq$?UP+*Tmtp4K&uHpcFA3lm_aaCJu<8cuI+;z`ia4!)dyhZ!UXS);> z7fzt)*LG>zW?6CF=LX$`X_;a6O6MYFuOK`aRQ0rh#CsFFc?((4CdlNkP_a(vyIKCwa856>$Shw+54QYN#SU^9i>Q!syI)-2 zBG}BPr^NfkBlzS88U6$#>UB`fLgbbCH@g7#-z>&^Snr$rWTPHbC*4sX?FZT{`WtQN zPD7=TvGMlXd(fM|-?NJfQ1u8Lq!|a8$PG9a!$%$fJ)IhMnG$|Gw;!h>DT4i=;oXlC)= zficAi&u;DyEPzgDvMCj9%5C0othV!9)g-%v^EC`+2{2D1IX&3nX6WmsRONhZ(GR(d zh#{YlEZ3|+w%QLBkp{8aL&Xq`IDFk5fc*{8Xg-P605tYqokU%?dWWyTclzV1bdkqdAW+r&>fTJkikx6l( zun_b~6npy=(Q6oK`A$mBlLV^I+}#l|SXxw&IFzPkn^TN;YI@sidKi&fbuP;E=V*)n z6Ei?=V6HD2Z@KzqkGaZt_$sa0xv zG$j~7%!uyb>FJWh|JH}}*}kP-v!W)&7?4;3D6Y@RnI0at<|v7=+?%w2xS@l}({;vg zt~=3q4)P9>oVabYu#nmro|RX!dMY_pj|;sO$Z&F0`?U6YF@!7p&0(#`TB@B>VMn_O zax<9ed4gZ_&D!KZ)iskljXtff z_Y+{R?4l2PqX{Gdb{v;EG8BY8G1x%0aC&Kx&hrdYMuvG}LL@*`RMfbSC5Jrz2ZGbS zte16}!iVdrsZ@e!Py)NX_NZT#o#%z@?Ce1j3Zkx~*%>yl2OIGeu;7!JXu`y()B8^mc{)KhjB}*agk%%W$I(Gv9d2*UBua!Y2P`Tm z0bpITJTvWZX}D13iFX7P5zkM;i9+3^>xAY+DNIh*=L?s$6+LHySV}F&$c?-vXKi2kO&?kcQQXe zKMUFFYizzkfbNRLzsfLo#KGC>D|OjPKRndqw$b%C{I6YddR6Yq8)FX$Vvn^RK ze!i6#j()S75Ny|Cdy*0P|7;gEjeOnr(}A9|70!Lw>Y91W`_?{vXDdOba7Gn}dB|&n zSEU=1iIL8BS_(+@uH6mLyp!kqqhBqzQXX6Z=jc9qKg!ncrxjk|$mCpl9oN?v#+o%& zWH;2Uz>5E?_RWlg$eoR)!Mqq2>@4@jqTQgqOujdI#O(UbTCC}P6tIc`XdL&?2@1W` zl+?DijS*L)#pXLz?&ls}441!QSEIJ4aXyPecv&rr>t`Oc{ss!VzeuGsw6~_3ZDU=f zEIsC4e@V`$F}=E9?M$^>gDj#*0(Yeovwo0|vij=5OZ+>f=KymchE(_SC+e$L^-^;H z4gGr^Tp6bqSGUe1cCt>mH8)q%cz9a#NV!<3+U7m2p-9u_sCub8h z-zZ7)7eypK#3E67a}YbYnYTU2PII6G=K>IWP4L;zBYJFbUCAf}_*n#`a3+nJg$1=V zjY4ZZPkK-MJX+RiQ82_wfKVOHb@nJP!b0d{kPs+yt77`dE`zyD)xlO_Uwp&t<$11D 
zQ#lb6&z3F}%x{GSirN#KyN!qqDwhy(SGw+;sb)`MAytyYGdJ!$y_wMaDtGNTl)tMe^Cb0=eQKbWhj~z%OAM@1pG5t3=^X38o29-SFF4ar{0ng2^7BoC)U^+$}v(&bs zbI2`F!1)z5_Loh_;sr~3JS75&TZu()X6~~7M>77bFA~@Y#!N3(YY7|ZkrEMY+#iTL z{&-PGvT~+Kd&ikpBr54dmeCHlFvxWTptv`br-uVbs0V-e&iKv|A*g>;qB{|VU8 zZ@siNQyXX-Vts35$|(0*g<16Cb#NVymz%HuO7jvBqG!`6Uh+Qnym+w7Oe7ToeKw&i zrHfUOarN?Gj!`5)cXItYmOuXv+e5D9?xd8&Ot;dFrUtr*75diT-*t{|vg`T2R%Vq> znZK|q9HlrzFAmbQcdOGKVvGRUy=8(7;tq>J1>f=m?S+m62nl-79WaCU58gY*|KtL}RD-0SYPke$Om*KFyncl3xPACt3XVLPFPj1pO4L&vMfoG>GbRi=U3*<3ulJh^kSQ!o(^82TQ zuk?gIAG9`cznET$Yq>yaf=;CwjK>FXrv3Ldk7>vUO)Fi>>zk+Y{9yqgWl1FR^qExXp;O)ZbqO7ZieQvC@6ib8MyTX2n=2+pvMJae zSWg6f@VoUcgMOyYI6!Su_1lBe;!-EuEcJ}cwBkTxm!C{WhG!ta+p(PJoSYA}5#zEM z+B~oJy&RLhNE{rIq-A6P*8J%k#)$3)p|)f?zJ>Li7y0i_6zZX{j{$v&4C;|Qyd!R^ zPCNU191j)OE#%YUmLuMK-XnIOQ+@iIEx1 z$@lellR>Wdp&?&Q%*?>R#K%sj&+g-lM=$*(q;5EIGCUNm`lToBKVyieq||3u(2(3b z{6yOSo~8B#qiXJq`7@OSq-e%6hWzV?=K^ypj#-DU1nz2{YB3Yrkv{{T&8x)%XqPM^w>TMtu1dM1(jRaC6X`n{U_o4ON>OncQ|1C{-3#b#iKc zjp_{Kfej5qVOb;J+uIvDJI5MQ{G+22e0-D(I9~ZhMrHxR!65%zu-t&Ix@<<}7tSv> zcXl1}OIda8Q^qE~ABTPV>`2Lfy8pvB=+`f$-oCf7DdpvbIXS_*PEK4*JrnZ%(sO}}TeY_IGtAl}ofsKQMjS(Otc}ry| zPqR23YnJE6olKD&-)*$TatJ`MG$hkeGoL(@4$ps|5&QgsyAA>9e<+Y>J{f1PPGn?c z|Fxczb@OKen*vUsVgu9|Wgg5N#PEZqu5df@fA4P*KplxkXnU|97~k~i8GXxpgHHWt z#w)9u64`>K_Alt`R9mw%KkOFqB}9}`}~>bS2TCMQn=)o z$S>6=NBo9{=M~}NXt*!$%U59Bi7gWTtqAekwq5?^J%~PAuB^lKnsM3NuE1XQ9uL%c z<&%;aYrSaGf=lJFjeMFy4nVP$ER8EO&9@awdq#ocOSW#2if6GHY=19$u(RVPW0#W+ zPl55kI^!K{Kb*ta20O$27vE&kXarE5c5|0AT#J$HL48efwIEn+xJ0UNkR-a<9zBqq z{bA5LU$sqBK6}%vTQDu6bk~4d{0*J;?rM=}mX2+Azs%C~c|UFrnEiBn&siGl*Q)!2 zy&1QI?dnYnKnq$%YiMP>(oqGBp+Macw*q$07k;cUj)vQCn_X71xRIo#wkKvw++rRpsF9M9JYN0eahzR}_Q~Dc6J7aQj;hDGUB1r|@wP7$$JcG3Q%pbl(WCE_g z&OKZR>Gy4ImwLbb^j6K$4k()<&m#<1eok7q5O62p?AkWQkI~Y+U!hH{A})j?OxK*h z3Rdr|j`Y>lWdMj)R=LEOc6%z=t6ZJc6h{NGx>)-KZ0)L-^NoR#^|V=i%3AnZIrY|o zPL9jF)}#IGd6WpsHZF}{J()GdIsKR!hkpFXDleDdu|Uap!D`KXGGcXKr7#X(81e(z z_3VK5A=gTCgI8(s4(jz7{APQMz?baGY$(y;U)(KWjY%hiy5%pV>s 
z6&p~qq{BXvjTSGr3-QejV7r|Vi_sp8e@(70iy?-Mml}?|cH^EiPA*gYtjQd=F_f^7 zWzbUoJS=pzGo8>(El%FK$q#~Rw`Ggb43(v3??V)UF2$=t5bNuBgo zGzcN-3DVmbcN6QWG9(tBIvF52XaP@Oapq1)@?Sp)xq;@#{XKK=KK6C_4kq%8c}fhE z+zPx~x)4HJiK@W8`ULx#1yF**A|A*yg<%`SsMeK=tC0zO7%e?Z55s?amU>=3hWR9S zDr$R)Y0GO)jUfLKmi|Nt$x6ieM2%a(Cy-DtYPT0__*~qfU-Cb2(U0TzP$#jn`==0p}+5g1kbhP$KZ}ja5U1nZ=e52iK zt#g%=plrTs7!3$bIq=&Omba*M4?yRyEs(2#SWQweLyX(mUgrv}j*?9B>i<&q1GVwC z0nb3{{b~=$WwA`qLTThjF1V#c9Hu0VckLbxmm*=m)kk?@v3swf@s~R-Euux zFAX{DJ%Tm$UFo6@KAxw>GksQ*lqCmhJgMDa0uSf=s)$9T+S8vzMuei4s-Cku$;A>i zQ_)7Hk7|tiR?#2!So-M4`fx!DR;PctqfpBL&@?}Zc*~ejd*3>J6;jHBlPrNj_2uTD z*@F!8{`(q~oPq+}TIHG8f@*840tYiiUTdw;Nl6AB51T!=Dn-Az1C|>n^xA!{N3m*} z4cdbZyw%V)-0#ottXB5URv+wF8qcnO)96oEQ7bBs=$TE@N&;3BMXwx?!XJGWDtF%@ zlb7;gi=zx6gV&siJm0sr`7C#JEq3KlE7~fj$D9MTvcKwY6q@EQr)n1oOdB0^-ljGy z^990?7x$y4b;8!iKb)PnB{IBeM?>=%Fu6%-ko^_^JKZzN`%nt6T3xLSJgLBhAg|-2 zr8Mgvd zU84Ad)m#h=h4crYar;^IrtI#KcLoquNGHw00iHD4ZNG*vLK;J7sAg%hzNg@N?k3!8 zwS1ONlZQ0;a1kUrKzbIaT14VB3T({==L?j}xK}5go8N-kKpqb8qM>YRJlC!RtFzs6 zJ(|xnPo3XI<4pu5|EC!AFXAzK5yB{1SgiSSNBklelS)H^{Wu~#EbDQ}vSv|l;<9C+ zKld$*YGS3~;b(u=x<*$8UmB{-%iCifu6fB31H#zIQr*q)NZ-ETleJ@ABhqDgH31vo zm+^2?RCl^I`o!+>;PhKbEJ0Jz)pkD&Ze~zZTj!UCY-X3++$pypqnx~IC4Nj24e|Rk z(+}*n&*yjgc-Nn@AVA-zDd@h90dS0)ohD~I>4#7y>6e_pokG@7o4wfRX_OuWxGz#Y zzv(!23BG?L^|=}uM<}YqgcGxryuY1EX=@}%Wj-O~eV0bMRz^+#e$?k}fPj5N7U9XU4fe!{!eRyd@>GHgzvMcXdtlD5@V|pfb ztzq~w=g)v(R`GtH7po*^wRhXu&T^n6PuS=G`Ymp1uJ#Q&9VC>bdPZX2p$tC1 zwrN%fxGgf5ndw|~xt^_?f159Mqx`O^{I-zEzx_Gz(57S8phUSRgZ`<^gGo2J+0k6p zbG;~dZ-L0pw=Wcz4W{WEXQhu%jW~Iik+L^Xz}x6Fy|y?M!RE`}K-Lf`Tgr2xVqQ;F z8v@;(s&9k#G&J&yiuH8PCW|^Hy!#rfGJgpTg+ib!4Fqy{T0xBZr6~`)tDSDnGCLm9Lt_nT}P-VP#54?K%IL})$eS+9T6~*rre|^a>Vt6s$zIEDm zLK#TG46}lJYGmTKb3J}Cc-HDNrxM>aiIJ6(O23%E*V0mGjL_Cx24b`eB>SK~%vv5C z)FtNEpY7rIcC~EQSc6ezOnOgS-NSD+9a`yj~WFDk-pE87~b@5f|&h?F#p9g96 z9?eI3*xcf4%<{vaUl!CYYUqi^FK!Q)ZukYr z3Bttl?jR(XF_%eCH!&Lk>H+7H`%_H1QxH1~mPRc!6*gIzW^!-Sld+W1{Q1wN0N~s{ z=abIGE;&kLLCm2nPxzLW>VcR)IuJEe)7Qmaebtg;d@-r0+rnI3$7 
z*v|yy>I6uLKi1o6yv4BykC*bpNdFa+hzJqTXCEmJa4NF1O{VX>)9L)@x+nr^)68I(=Y{2mtdG(r(lna)$3Y?`xuxE3Rt9snS2psy3cL(!I~?B-d}}-%i_0^ zGTl=-!>p;V{W{%RX(LB;@x$Xne*5QU8x?XEfJ!T>Qs~!Hs;}rQX*Qnd*7qr6-6W{m zpEZ)H3{8ywQ14`7Zt||lUvDcd>c%X!tAQg~ID=K5D;)Sd6!~nuH(dp<>CbcvW2x04 zakIXf4*G#?t^b3gQGHu=^!hqt5y}EY#fJ)K^{i&Tuk7N0-1CbJMgso<369$d1_PwUwllL{}x{#dpNU<}4n& z0vz(Bm7kH?%TtvnyYUD)la0$#?) z9OOSvwl1cePb}hY8!W<1K>TV-(r7n)*kJ3_iZM9ej(}k4bHuD_V&wpZMWM7fQz_g6 zdDcJ;%*G_XL3i3w|GMuB@9Rhv33d~xGE;t@Lmgl{%*_v3lxps*`G@N&R`I-FoDBm^ z{VG4mKd2SMJ!k}2!x*#_9E5R=<$_U&8~zRU@i5uaks6JK@#(LwJK*VZ)5L)KBY++ z(wXVi_R_R8oYZ@lXFJ{1n`-779ifJEz*9v~r~aS_jZBqkcQ4e*!^p<>X!L2dY+DNO z4{hb;0UTyNfBBl6n)yo2D4(t#Yo@dPzNT&GY9Jh6z09;IHUu*CgpO!3tIBl)R8g?o zJzsHkMe>)XO~?k?S{JJb+ggLrJLA)d8k?F!)Nj3~V@{h<16n2~GJWQ=-5s-UcV}j> zb~VvwYptp22}9UcBnt!I8r?nl9oiZq1D>6ow(~p|DjvS2=H>%R4cCEAt`Z@FI8>sm z`@se_;i9{uq5@h_*xc=g=c>)oZ~i~_myF+K?0c%RLU*7qzb*HU&X2834p!zSH}&&* z+w%Uh|I}(Pw!IFNQV4^b0?*uC?)`n}9*~s=qsEl<)@ViG&CPDSEv{i^R7Q4}0bh?_ zJJJr_8Yo81Oijn1Zz~LvGFUh@j-f7}-PQ7+cS1PA*Jv(vQtMBbDvyc(>gq2B;98-% zz<)jH{tK@KbcP0{FZa8;HKPXK^Hi#OyY#}n33#o4)$`rqKyeg5Y*$+w-Lz=FqpW%c z=vn}LUum~@abH$3@C0F9?QCS;J{^_FGm1YIJFdl@Tsl3N?wN{8bpEvYx&Z%v@6vg| z{^8`?-3zyIfsuJ9YBGnna|7OY{*JX@yJG^#odi5qy=Rv-l3v9##fjaPtF@H`Y`k<& z*MhVMVvD{s%y|?j=ZcWGeYxyH=pz^^@ zn@1zzQmsYPQExvW-yEo0@Mhm3o8yt;FazisnMM(dzinjzH%*=QT~EI`=dgYSe+nja zuTKE&V)Maza2a0Tl{ST60*wxChLQ`x6A*H;S)Mv1uT~vyH@ACp9?s<}MT-nHdm33l zz_}fz;c!-eU{5l8Erm|o#el0hzPxdHcTE6zZP>;nDac7-gx|kCT2;XQddS7!?s&c8 zeAS#<($6%wy;#~wGL_d#Fl&p{nZrs(N`)C_83=r z4aB{z0R}97W#L4lLlvL}lQA1yJ99enDz-}(^YJ_%61ixF33Ri;FewINVaN8Yp1)4x zgW5jQwj4CEm%fN z-sQXQr?AysEgKUv^Ujyu*{5+Jd3o&yNRJIKI-I62rr7ebDwmRTE_xrSvI%x!>+JDY z=y~0rcOqZ|G=9EMfwU;l;g63>tiDB8X=-9-0E@Kd_vYIHoj?e(;NGACoF z(Lxn%(8ggsl^en`>XPkwxgE~2`_eo{7=rv4&`k~1&-0X`vW`x)NqsrMbMrXCL&K^T zufg=n<}fNb#}8xV@Hu?{$?1rHl_@(cXDfar%yq1@*<|RkWg84iJkP~O?_ukT#Nf4k zWN}ffAqT>Kp5Z~*d#-21Yq5BQA=5{Ay^z zc~KQfBgN8hedhYX>R=U_?+lK+$m8VswEJ%IkOV(uZ;g+Vhut9r4AQ8+KA-Dgr!1uD 
zhd|dG%8PSRLH}CUt46e+N~o6Ftek=zU$Y6%Omk`LZGTH!GP)%hIC*zc_nlAPCE+I$ zbRh^yFgRjxa$l$zuzz0oZN!5$f2v69+P@$afYZ&;VXv=c!vdM~<$8_4l?0~d#y!_o zusLUG&9$nq73%OwOw9IvJ?cfHdR;T!|0*RLh}j!6mgVX#eehGSkeBdR6u+z8+D|yI zFdWEa3I>KbGu!<<4UQB|9Q04{mk3Rj#msQ&zzSh**P83}P-LuIwJ|c>AU2qDt3cHFl$o97 z9#i_oKL$f0G!bY`=gJuaOB)Y5Q(+Za&@ zw;kM)V7!RogWKg^6ij5r;US8R022+RjbpvGoRq%6rPs@Bv?igne*K{;owNrUejnrC zw^LQnZ}#_ZyEYe}2vV}U&s31QdTKNV4n`rHdjePIhJf!vXY1n#x5KS$`eN^VBCavm z%<){s%J8`5OtI2FHsi->QaT7PP}I8p18WWFCnMWnc23FRoXOZ@lf{7J(7vYc>hp~I z4G*tPW-#j^3_`PZ@UNcwpfj zN|bGKRVJ9`qO8g7LOv};9ZL2nib8MJC7@;TfS@YTcy zX4`-KB&*eA^u!J=&blHZ`{ieW3SSMpZou5Uqg76Klk`Q8~$DiVzUbxS5x9P-Rlh|tDZo4T3O|N6l&|>+Ofh0AC*dm`PZ@&{;o#R zJ9Yme(d}nKDh;6uND-vCh4q%lWYpTySwKkU%(0F3r{VJsMYO7#YYD4}M-aRI^!R^Tt_ z4>uOZUtR5{atjl`)WY>@i(P+qx_f?NnuQ{M&ptDU1+StmZ2=gg>ZN`p*)!eS6`q3? zfn6(OG7yo3Bq3TT80)uQBZdtG)}lg42v%HFDUV2zUlc+`j@oN+9Na(9aesZ-^K@rM z#INoB=r}|n17ouei~{s#xu7vjk?OpBwbZ0Gd&s~;3vy$+F^Fhx~W@p)t?~C{uSQ!yvJyqet+K~E@CCA zDn@_{=-IM={D~DPA|~m095n$;;vD-=nxM+=myKfhn?IjK!dpq{qDe+cnoKC2^l0{-deGu-)b3zO@%K0% zn%eVtCj*SV*4!<-^0)`L%6xWuf@n*nJr|7N8*V;ooTc(x4co(Fe;M7$6VV^ zMPK%fjLdv}H`_z}v9GJ2i=%-okbOM}XO`W;Mfirk_aJ)Gg6uYgq1h2~n82mCMU%p54_@lo}veJ;Pa;cn$P?zy|C4Mn6>o1x6~ zV1IoE2DYZ+*$XNB7!M@a@0Z$Q029!L$fM*^*(8}<-s;;Ti%ssinnS6!v8!lWoirbyo(%!h>5v#+D0CR< zXv^T%f&&A8MVLqwp;*PGO*qiBy6k)z($8}(^m%za;LvjJ9v`h)AewQVg>ueD6t5C2 z%HXb#`w5D=&o8Z%gQaW2Cwx8pc-D%F&SrlUJgNs>LE>kS3^p0CV{o7D`td&ll4PVs|x2E)&UoHPPABWnw& z=0Z5wtAs2|edD8=)yOXOohdyr7hcgPt=->+AU&`9{7EOQWRrW3%)2}f(o)A&1~KPP z1ao=yK4#~>2+0M&IfVG85vzccN_l2JG@RXb)QV|orQGef$P#{aIor_JQ~8bxf6DW* z5y*wavSqZT3pcg6K>sA{*|f0ZxE+PGbz)syQi`0NtTV-A|u!R2(gL5|&FSHU6=l!+>q_}aI)s}IvOA11w<8OSN};UlX) zw**RBpjfvLkBv|jMkKFhG2#gw*tP+#7+UXyB~A<=PIUXG`MsW$(g^$S z+QS0hZ@RWtIE_PXpprQ2r8GxWXK4wJ9oc*b0W^~E{`gbGXI$g%nyQM@R=_v7MIxFt z$%ByKnVWKOemIl{b6H=&mARh*R&9s6d1=MPX89VX%9zJ-|L&}coND>TcNgS8SK02V zr7RuX`{Q`Sz#w2eS|^qeT1$&s2t4dL1)hb5M!2R%(ZHWBkP9l5dCQqQaw>yIN(!`R zIx__{%cigyxFHpMshNoB<*h<@w<{^}uMolt*TbOvDUJg*+569iDt>{oubswLu&)An 
zNlQvkXu8jiR+cF4#JqNuN9=pF>vy=bB@!C`wG^|j0_Y#9}c)d&f_vvPhmzXx8?i+)ZTo{eFfS;_CqbtD{%)7jdr+Sm4YI9L_KAvK- zBzyQT#_&%6mZ-BL5Q+}wughTpgSV!2oR7*-8^eTRh{=ILsr9zi4)x^fm^aE_G!3fK zBs_XUH%~Y1yt>)5N+6}4KCHU_NKn;D>%WFN3fan?=tI| z9tzXM#3h30Z0gA4)gR=H-11h{=_wTwm)Kyexl_?xFe_>))r|4ku*qNLAPxSV9ROusO;gF)2&7UZo zbJ$|SE#l%wpG})6Z*(F6eV1`@B9y>go|T6frNvVW&lcC&QUHNK@R@5}V(GaiZJmnL zzNqjgt#`n( zVf8k*IFVs%OqH?%e=3gUaF2JNxxb{LdEVyUoEO}M4ist&O;kPmGX^)(LFQrP>oW0#*G@H>-#|j3ie*sSn=Bveu zy!<{!H{5W%ygew0lAJe5)SmLu2S-I2>12AkmXql@EJ)kMvap=iJ~5UfBQ3-yEZ7x= z41=Ex!%Gx{{}^UZcC0XQQ2*>lK6%lQ1l7P0`QQ+_Nxq5Baw@ZvW~18NcQ`p7Uq8j} zf0BJ-zbt>SU+r0qUod9{jOtdJi^Yn$g}&Z?n2ohHr26ZN?%s~PAvj>dWAvzj8wsa~ zfLnz=V6uVhIPjKU}Tcw_`&g%Epb9uL_%5`ZyUqf zfV)4T%gydMdMZ2xz30UG0|pe`&%AjgT|~KZV=^}p46n?!uY2Xph;{Yk#pvrtBG zp`FC!B5fe)Qe?Shs7L+)958+|gclnonU!EZ{d>Yz6q^}0MQI~PBTq49U8rZO1mTzj z^pjZ3-%!2k-__W&RUsTS2E^5HJa9Lk?l;1plE+dsxt?i8Ljt+6S0p!^D0~fJx=wVW z-%0$33WEfSyntM%-h910M1{ro6(Gs_(edysPl$2z3_&-_5CKV|$1KI9!4*rFD|-ZicbnX5ou#c!P@^6lZR7M`( zS6=QJo695GkXb8caVjjq^S>VD?gG>SE1DG#T7WsJNxZ z=t)*7K=~^#@aT3?f!{CCJ@*KL#@?RH%8R4Ax~M+#-6)6&>3=G8h^O3iMdEH;vUwUP zQV3)vA8={FxEg_+6(a6M+@6^si;wG8%G3<9!CpOH;f-6M?Mw}<_Ua23OZHQcgvGRo zDW*D576zJlPSavgmqT5tw7I_w2P@@4y=`Eo|B#eH&^JoR{CpZ;3<49qo{K6maIy-W z@tyR|Kx;-l`c_uj3to45m1E&&psDHXEl|VQ-bF>kcaDU_|IsS7H4dEx^rzr}!r0UY zZ5ly=TWm%$Xc9@pc29~62`d$-y^97O#zero+%l&MD4dJbcTgw6U~XS90+)k#I?=Wt6oNb8`U0f>aDq&3i- z$temt2S{)(`@f*`ShmK+eX$kM(xOQVGdlQXYW7=C5bf1%Gy}m2-;|!8{*y1%0t81) z_nDOM^U~8@zHiZ||L{1HcasoPG3jiqcF*@fOgHm1*{goh`{;{se)TSM)|xnE8;o)< zSQk1LZ?sgp6-FDbW@KE8#BYnf4{`RRsw`lyvR`Y-`>_#5LgZ5p}W7Y%&6utvNLYmf%^dMb8ol{jt8Fc0R5zys0Vdb&bJEI7H^fra}a( z%Rd#>Qypta9chn50wG-o9Z21pd4SRqxFUF|$<)Ij&#HhRJRMEjq@9nK@CbeA=;er- zb1kHMptYcQbO$vo%<-FRSdz5LbXj&i(B=ef!M1qUQ(Mi_x7|Y_IOMDh-U9Mr|IZXA zCMX+SA+U)|sx_g}N=kVyr+ffh>zb85y27h10wIX&r}p+@$AR?Pwtq_b`sH2s2`M

O9cQqe-r40ZI!+3Ug`9f2FYR61Uxg8XTW|tkA4Y_xr@|Y1u%t5z!Nes zkgKwe19<{2z?{ZTQBz1yB^?3I5ah*4zZk7J&(3VDrS6|hO;;o&Sz4@vY?r92C_Kkc z!&7f%sx8fTMISK8z{XK$CS47HwP5%V+s{4}_-(}j|HPR81+Zin!ZtfO#wP9W0Qq+* z;K^=Zdr8b9xYazlR2mp4#u@G0N9kT%N*CVoP@OGlRM}Y*wMkDLC@{tTaf-|t{{y<* zf$D67^=C%yiTgKB;GikasmM%CczyG?c_X}__|(1(D@#V&kk@Mr=jHmX-&uiNR?Qk) zn&0#CZTs!^DsBI4CpVsUflFKTpN&BL!4G*&M@stg0Sk;#d|<$qW;8M^R6$-gFH%NI z<_34w{O6*8QJ}7UMG$nX{AZZK)>S1XeA_hE;wl6?=<%@MOyV5K?}6VdFrQjkYp2D7 z2F7V@X-fB)1B#h1m7$mojmP!of;l|v(T(CDUqqibJl(HW1!+jWuc|S?b_8yQM26q; zH;+)A5cT8yzW>H8b`-ZVDDY2qIu6xM(`8m?>Zt&>p(|cckkZn!6&6OLrB#odz=Shb zvt#Sx!flra(QT?o|G;luULjXCpbT*V$75jucgUw;DYCuOR+o16*{D}OcD8mFZ~=2V z@|AIC55Wzji1Thugn<#fT#OV@Es)&IgQlQd4+#;Sct>s4b2p<#gAitspUZ^{%jYXi zNQlS2e0b>g`&6&j($@0#k;6VmcKB3)x~#=JW<)LV`Rw#+t$HCPjAS*GAV=`qy`|!i>>FvUnfW>kq^d3*WLJZ z^Vifiz8iq=He(c^q+z6G;UN`Z{m8*bPD@8tRQo%R@Lzo6yZGnZQ9DI&q3VW%^X6lp zjQ|b1sXGh4a}$@_>ir3s=t2g(shp6q&j?tNl>HY8#t;qFo5e&em{S_Dzn~hs)|XI(itsce8qd8Y4`?_vZ@9lBYZUdEl0K70QUpx>Pd#${+pz%E@Z00o)u@qB+Y zmNedf^&l?JTc!8(v{bhn@UF?p7S$q9otlS@jasOHcuF_QZ30VVj;U2L*}MYANaxBqhF&k3;{Sjz~h6 zb8$gpXQy}~OitDytfeesR2dQiAa;pkNntO#J}=9Cb8`cupcvzSKnibC2zb!w%zGmA zc0*eeBqVp2r^}q8J0@?2H+?TECaSCcb#eVn3926}YuzJq)o7i0T8AUz;?Vs+SCU+e z$O2$Vo)|Om1Q}8g&&r2W{c4g8EK+$s-H?+5acjoj9wOyi0Jk>Q$uBWhqLe5pRn1j$smaQxVgpsx~Nsa+Uc~CT8qr(S>+;o3*&-($!jGm7{ zR^*B=P>fm2_OFZDKUx8&qoBE06eL8p;#W>W4YB|K)b>lI$~G%zv!2gh_i@&3`piVG zV`2@fo7Y&ucRBym7bmAuU?Mg)2BGkICXOw6J{t+5EVf+{JBTQasVY8^yH))IGL{uCS4=y|%^P!liah zm`3~6CjGyUX*(3L*AoEWJ|P~pyPq=VstcLA&!@VzZFIcCZo%hLN{vni5WOEEie{Us z^BI+i4TeWEjZr#D%Dw)0iKqU8dsoNikRvO6urCQK=ePYCRSH3az4_(RJKMqu<+j~+ z8zvO3z{PxH*a85S6{o^Kh`{|58Y!VH5s@@d2zi?2(v~EmKb;qjzoUAU^0)|f`W9Mn4MS0^4qqYY@Yh8;`=W0l0^?+ zR%p~D2sCI^Z7^_{L+&os(=Y%`>X~BfB9;*(Y;{8DL+pPV|9j=#3SwZDJc*5R^hqC1 zSIvG$&E=`MdM)+1cG1`N{Q;BaWniWU?M&Zu+?)Ie2`#gUUmT2+|WgroSDwbp_@ zner&uM~)+AR!Vxj7i}r8C+AFi&QVm8lR;DwWToI|Cn5hy#d=L9hAH$Fy!|`(5+7ER zcYpJu0|~^JyVeWn2fl^6sTmIQW4v%H3@TE;jdZYJ+~++aFus3I4&wigaGOn(*%d`C 
zCyUCqr5^5g?BLsB3v}m923P9+aPZO--B-uEI4_mDSaZN~Fq0M#j)Z z&auhK)z&7)Ca4loQ{z&8Vz(hcduFX)wLDBho>$P*?MlgzHOi%j{)=hOqceY;TJdt9 zi=#>N>JUUtYj-;|w3dIpv|UZC= zvSYwMTl(sfRlpmeF{FPnS6j%7T(J(yG9uy!lP0tELM2LUY`=WeT1T3SVzvD+YAh+z zXL!RR+E!?BOD1tz(Du;m$XLwA_m;UdiMr?G1-_$vQ0=d)vQlD1jvVHJ28}8JfW$>b zyKoTK27!O+@dZbDd0}%6*KmCY4Hym`6qkee8iE0o44c!uLo#E#AWa%0zXb_Z&rO#c*aL`A$_d$2nz5a|Nfu@H8*b1a*#Z++| zw17I~e@H?8d2v4gD=96-xWSWW=kV@P*Tw**JiVbVqUn;RVlV*{8;h+?ZPv64R9~O4 z=n<1Zx+W}Lwzkq42wIAXtBi_(voU~*IAS*g7+V|t`*1vCVj3DhH`{^N*)6fZKde|7 zV$-+-pmow<_O3q=KigkT2avj{)O1z;9IHvHHrNee6pw4zR901%{u~=u8YMGT=ONJ7 z`a`$t^Vt(;mRKYU*dDy}Wq;$h(&OESFZ0E7p2mo+ZuJ2Q0)sfSvpNCCukKKIoYs6U zr>iL{g%L-KX!__k=R*euXL7GM3i+f=KCX|B0N?pojGe{50w^6Au!+Gp9KrLxv-f*r z;YwXpp*n6VmQbTict2NH6L^t*fAqts*@!D@aw(#aQBUXIA^ISHCR|O(jf=}8B6^;H zN2C@P|CHzS%?RK296L)8L*zHE*Tk0~*HEvbr8c;^4YcmQd>*`W} zuCEQ2)&y+HIahCV!E9Fe^&!pwcoN7 z7%7taNjke>ckVq-ZR*jOgw0*KAdABiP9iT8)7z->mBj2~m2y+%{qav>XwqHg_dbS$ zveW`&2VUQ51$kLwO|YN$Z*6#-4zDYCNIT<4OF@yT%befViq)R7ejVxj>jhBwt}z`( zgCmhH7w_-r3w*XH7+=N0X$EKKTkrL(tdYj~T;aEDc8en+5q+hm(ym(W<647)I3{t` zO<{2YS{eOl9`R83FqDHtv>;oZNvx^oqDqY`St z3+_zx;cwqreaKrlXaG9`cuxu0Qw5Y9sK1E^ZwH7{c=)G6S=?qrQAV}!%_@&OM0WOf zO;%&*vBh@o^b_b7q`HX3GSG(7LP^N9*^nquvTqk8olfLpTy2`X!%VcPKF{|7cW2@m z6?vu0tAm9N0itOzNS{s?g%^fU=RUI^yI8N@Z&v8^=K_`8i*_S<;G+0n$1p%f`L-6W zKRfLw$sdZngG(1|{iIXXnf3&hI8;?A6ANpCGw}9OUY;NeLqeidq2KIOvhib6&ld+Q z@hr&IqGbtpMLi1E^+|K~P`h;yDXrdoI@cl3l+Y~NG8PA$u@w6DvFMhGWStO;-3qm~ z36zrrYEb`+08Dq z3moNjCmr+?8w=N-n#>$Nnd7ns!HrvNRG5gG@p8;)9sTq+#gd~hECpB1ml_L)2f^mF zi_Dqgk|YK&PQ+K*2u-5vKxi#-FbHfqMXCk*1}AEJIY6?}8Q;4uawX%Z;hw)V==`48 z{3k33L9mMmsYrM|+@?5us$ire^GV}tc&iPH3`=W6NQ;~@*k&#R5Pmujwm+XeJX$+3 zqpmCa3N~&Jr$|Rzj+8`t$iw|G&X0|)W|-N2S$m1a*4W|sYfLxR2LDtrb@_<$eejze zNY)Zh8wcljK2t_X$?&2T+9Pw_OU_1LTH69(7zjODr$qk;E}EU%^N2=fO28-i?$)Z= zbxPTwtK>Ttz9Q?iXC+-9uOsu827_Oua3>Ecc8`WULby+MtB4^>3a5tfr{|B|*Xx1O z`Q+JR()_(r7ZUR1j^fl9EZ%q48$T6uQf=R+R5Y4?&eX9oSF>?gf3>j4&LMGBw^FZW z{h07Qj`r6}cYk5w{2u?#`rkwdK>a1_aN_ZPJARc@f!jIvfQLmo)=AOkK^OIG^PE}q 
zKujyEESS_V;3Mf5uTIxg77;ix_SXL~ zs>M&%2?aYBxPsS@g_E*4&)LLTXJ23MG{32VDq2iI0L8wlD{ykAd1eoKuZ>4oifF0nrNpQ@Y3K__X_E5NCB>*3-CZ88 zU=831QM?PK02rUZNhvP}Q2?KZ7JmRG2Ise6A>|PKqiYm8q54@|vKC4M8FluWws_=o z;rc`}Ei|c^hLhWNdHE}e>tOLC^VyA=gmw!ad5fTJ(6DS3bP}mJcW%Gf50rFC>hrGK z7Zs%lzl4PRioD$>nOG;UW=!1FgOL#poG|UVr66b`sME-BDyzo>Y}vAsVw#F2wkksm ze0SLY5-l}D@$hgXnJ=7<=Sk?o6@g2=>=W=GwOFS70sq8Yb|1L6Icus9W}8_mbdlnI zdrs3uz2qLY{9?o9fLdHJ%PXO#lOZ5Ee)s2@fu$*-9t6`p$4^NU5A z64l%+vT?g@C6RKw2}Io^WvsB3`3}bo|$)Hh) z*m)3+;#QqIsmEl=itina+g1rXIXuufc2L&TQ_+cB6n#WV4`mj2R6(DKI^|P<%|YnF z>Z80po-JcfXwQ~-kiN%^y@&IW-}Cs0`Ok@LBl7tmu+zri{@pzc+Wy4EhyjpRRL;1DwIci9#eSbe#5L;iSgt0_DoSdyRp?t3BS7#CdjuL!sxxKI5D}CcQBPWYh1o4 znv%3XALi}fGM>!^3TE%y+s){zc#PaoGLghP z?wT`nh9U|cn0zLI@q)Qh33U8{ov18X53{Ow$ffgfPlgdeD~ z+xs4j7vK?w<7x{NKMnCUxgs793JiiFXJ$yfo<_l`((YdQe)=bVE(2=dp4>>4!&;GQ zKa?|n#*9`9M=Gb|*{P`o@>%ae86^v4cYSMcyO?EL8uX{scV=OTkrPG81N=KBkV#|G zW6gTjb=t`n-Ls(pUQ!OQ#X@T}?@G6VM+pU0ej_FSo`@@tuDCy$~rW zol7*Fe0*5bq)3zM_*oRwrqwG}-486zpY&1-b)k11pYtF*yewmFu0Y0vZGo7U0*~)b!Ij`hMdi9K+N+! 
zPYnnY-Kej>jsPD^R^j8vl2i9EZPm}BMwVP1EF5vQwbRMN#Km$-v6mY|3p1h&%ePCO z*EG*mPdJdwacS|w$)z9}v>K>C^0Iqjh%e_CD5-si|B>?bwt%oWqOH@2(&4b8^8z6Z zRW%(i_x273elU`zFP9VO+;JPe-&KKfJsBIWmD}s7zYVa`)aFAFCh9$S{=$JYK2S$_ zOAjO+M%KdKBwTa2DF+z#9BU&1i6Y5IpXT7#Ipz1M!N7M&)QK6(s1PeW^>E(z>(qs2tjp{h;ljIJ?}QGPvve9oeoF} z`+Ez$Wuc-G^s;?mb(!y9DJRl3u}+x$jPyYQ4?ZNfDoxge-=0#tF#V{!)J>y6rRX*M zIcss-aV%bYiQm7Aj!^!Hgng~s7jm)ELrPA|(m21)L@^YT(lR_*cOwUY`e|TEI)xv_8QZ@#AYVLzH zZoZN@d1SKzXc)1O;#rw>ZqZj~gBb4ahcW4Oa=#>Ca7PKS8emr4=DtPA2Ns zITRoOi?UokyooYOCbAxGM)GS0-zXPgG#d52L^S8t9`|ky1iDNr(JuzFmP|IHqF-4Z z#Ohj|j!VsfDRpTF2Wdw~X}8bDrRA$lnoGsi(OV?>CIOG?Y>6MxXu82eQ z4Z0FB(En^|#TD4iy4mI1s7>#`g9}hs{dXyy9Sqmp(&l(@`nXAX=!l$opp!_*V{53# z4=3CJZiSb+z&+^@7#BCqISHw|mUbzr<-(ZN39WDk2!Wz93mZErT}SFNwW&xrxLE|m zHL~cdiD&&M9WKG?TvzZ@Ks&2Ujtt=N^GM?dN^hK51F&qqtgz|IFiTo3=kSmB3`%mx z@)KOQq5*mKwZoQer~JA%kwILF(<=dLV7w$uL;LwqX#bX{lR|%{Dk=a{B{$*M1lSv& z#tN?5n?>cGkZQYXSI3pK63CVl!N;JFmZYZsO%4)#Io%MLQa|Ts0^d|sA8(DR#=3rH zOWsGRtG+jcA7b^QVXa|?c;}UPte%MHMf3RBAm6<*X={h&6b5KuNe018O$yEW!Z9SK z*7Ep=egF#WP*Gw_V6b9kWr;Ma7VhqbY<;2WF(u+t8?jvTDTT;oGGg7Dn z43c}r=lrOqdObIHGds5>nDZGTEsTJ$tD5;D{W9wv40_o|R<`&(jeS4GvQ_JV$9xo` z<|&`h(65$;Bu66YoY&tncw!g{^@R%c*IM&L&W2zRC&j+7oP2U4b4OWAD|%;>N;AhA0kHq=LM z3UXUtSnm1DZ$9mx>oEZKg;D&*Zvh#GFq2v=tS%qEM#hf=%EFSG*q&g@d0CP) zXU%G&V(>^n$>8dRe4ZHl{SHSUiR~_LgKb$dOPH-7U;IbKASbH7yxS+HA^lqWp{`w% zg5Njh6QV1oo$mG-G%hDUea0guWZ~)yOm^YmoMQoe<4*~&#fcpq{2xjHBlP6XF9}t5 zA>-HdM&9Q`)iOmkNCZm(F>GysCb9QQY&&UqMT02p|FQNKPL}O zpJ&9(ae3U1h@>YyU+bw0g=BF7SH1hHOGm^WO)GyL4QSRokdw|%)Y%%5 z(NKVc;*Zp&uUsONjQF&pnI9A$VLvfY11vn>CWwNS#w5;i^NI|w$GGG>8U`8?(zy9R zXROd@u`hin>>Dw>);|G(!xj|#fBGv3ik_5Uwocn;67ehD1_a!z5GoE24>xv8^7)7* zK+dx;0fXu1&~Spo!!=2ORWkz^`0Yw#*NjShLdImv?Rvd+Qg`Sg7bN6rhQOOuMm`P} zhOY=LW^yuqt&&Lx@H^i~{_*$fYZiCJrk-o)8rB&;#O%-YH zzrAwi{o2@sSZPB@iQ^X|6ZH6+uD9u@n}ff`HD`Jb+{1Y3aS%8n+BLN!E_SvHZ@Fws zko}!f%vQU{O<#_(xBh%MVWRvyR8)W^Vp>p#!;E-c| zkGC1jS_l!EsLJz>Yi*lUEj>v}O$woCaia1uLdU7}lOUkzL&eDf*nsugie;|ae%9cn 
z{bzdW_Z0qXK4*tenTgG(-#qL029|fna<_cBfd|_&quyA-rN=7UkBmuIkrNwB&yRN< zBbeY5R09mPm24X&BWf-uXD!(ejsC;fo_&#g+V_hUNGX6(+dXuf*TH_xm$FeuL_h4QgoeH0Yf1$exyM z$eff61iO7IRHS#}UzB8I+2T*$^t+&hbd|O)d(iMG$4?8L9CA&yDeN!vq$^Z2+rzNk zCO4u4FfZ%yd8fX$YF*5i};?^;Qf#Q3T0tvd&yNKi6%sI5gBz zw2RHwR&t493QBr-==AOZMR^%h{q+*=o|nom7{MTP!bV?eu-DYX9wb325G)`WrC6`1 zqM{^UywLUjA>Sw}Lh19f z1oO;gwW1Ro)rQju)%M%t^c*h>x+;kZIB2xKMhbHb$K;Oo)hx7PusDqlJB<25X19I4 zo}xFNl22!wKa}@-q}3+a7+IL>$VP-c{h5&+?lInNx4Fu{6wfx^zaH}VKKngs8KCb| z2cM2+q{Q5BpSmupsAG+j#e5L_RfA%a2PRC5C%R7fzXS|_Mh}A4r0c8S<8F5RB7{*E zPG|yoU2`2y5)!;unszaNp1fNe7{J7sF z(cccOlh0QcrTM6uZWiNKy`P(=rjWYLP*wf$E&X@pF)(ot1^Dg}x|ENx@Q{t)@X% zcK0R#L_`+u(ms>0-S{H4f36j3M6;*lczv9 z@lQt(r1TA0X-?YTpRo9OaKOXIXU?TYOhq&aoUCn=Fzr(erpcg79w6NE6%0%V8XDE& zotqnRAQtcac^5^%c+Hz`+J=*s-bO_g9imVE%EZ&czD=rB(pJv+5E*jk zv2*OCF%~y4NYm^@7^Y;BOZM_S2UU(R+=p6miL46-W4!cbwCg?{7-9aFS43-3A1^DF zbG-4K0O2Ef?*ST0fNXbAf5aW|k&=t(aJ7R1CTqPsVQRkLCI<2nA z>z_|Td{}^hw%9ULjMC#Ap1b-Pmk4cwj-FblLyzGk1ky&y{=x(Q5SH z1wX0JLj9(lmLo9}x0%&hb3kEp^G!r4Hm5pji4gI&L_-^_K0pID4ZuV+P?1slUiW-k z>i(@bNzv!DgyPIB3q;CA@+xw6TNw8_n^?A=*asKkkA$^g899gm{g^hNkkW;B-c*vi zr`=`Xdc|-$_;Qvcngd)dkXCX}r z4T?ooq$(2RN5R4O^EgN11sS%KP(AmrJ94q#WRsZmSk5-EwWb1n$< zBMXyH1)1~c;!b@}X~6h6fXrR@I!~`XFIxeZc1ln081UUUg+oAGtV)ZXj>jDu9c$CT z7;+Y*@f{H5Hd)(%6SFp2v%^ma*zY0e%4Ksy=;AR20NQAIB%t*Cp3)KpFpOR^!^1~- z_p-7&;^N^7$v&!fFt9LWdc`@@Ev}bd){Adn&lHrDl=jh7075{f8w_A}U(=cb&otEJ zZ6hK`MTnD9zY)Ifc|4!-@sHau&O$VFCB59G2iD4u`zo6Qqv-0oP|mgmL-81T zb$MxDUx6ug*E1LMZU1@iDl}iEZW-X$xr#9#UygL`e44ZZmySWrdF850nlA{mM1(Ip zL(F9T91aV)|KR36DGVBIdQ0!-dhvQ++Sn-BDT*r$w2K|7lhJ?ioPHhu@AhKsC=)Cb z_-xjMYrr||F4Qb(3P0c@t7Bug=ac;RL)PCzpo*~?!R54n5a9T7AgFD zwgq)KdH|RBS@GI#I6LXr3fHl_jlLrel&>;=p4YROB53E4K%2>(hwCyMJIInIk-$r1 zUEvW|J7|6oze#1X1ahPMfr%)=ms%Zm1b9LlGb8QlmK6CWpX-TM4XOCIn zeHyxLY3dtyOlouyj7#9y@?oz@j>5LrFn0KA=#j9%ch60|)B@7}50 znQxSgE4jSP*KPN(3S-e%O}~ryqpp=%44Yd}01XA@g#ZT^@ihn|<+Y^ZkS+_TriVau zqPFP{$JcgUQ~=5+{Mt$%#1TJ{YEV-H`=?w9{U2)z8B_KeVDCXuZ;qwLhKRqkH*F~8 
zp#Up6PDVVt(_Hx#Zu&@FE%tUzhSzzf?ng|f8GutE^!yM|x!8n3WJXS$@0elsh>Y1#W`x_U@Aww(CyjEGG(s?iyAQLR?(JrD~=n znV#he zFxY|(z33R$>g~5LA4}ob^YeNtlI9o~f@I(7>o4{6%595+Geu?0F<^enkkP3M#@pR7 zDbt-0pyvJd&H0!KMIKl^%Sti}g`v67w)e`@rz!Z!RrcAdS*F=(pK^e6#D!rLG9JEa zb*5SxpN$;bFV2mf4Gke(xJ2+Ab&V>!|4=AZ4w5D8eosj~vSnca{*MCi`Xl0Dor8BH z&7bD?=a%Rn9>gY@`FeheXv2$mFKUuIJT$UUv(ZychfYof+VLdOpzD{xU3Cv+;=JMR zn1P}(3l0Ch0Oydt(EF5le3&eQw|b6G$6XfKMm8u0uCPfqKBDRmg%)i0$-(VDc<&i32=jbt$;B=z!auXWXI<0vv~Uk7 z0ZYr!Tu}@RZxDEOE`!(qxy;R&fFaSH_Z*>XUr zQPsfU2e`c*_gsy!+y2)CHVnv?&`1n^E_ zoQx4ZEg{)7>uVEE02YoFQuN@3;V>t)D@NwJ{F~jg0_UST=iktw|z% zzHG@;paaNDYr(R1LVNBk)O<7d@_Ky%GJGRMCXYLNL+#T~&oplbeG}MnXcanErca77piu-O=+t*IElm0-Rm> z#*@U@*g6b)-sih0nhkcX(y}aK=!P(Ozy;=!5}(#|?M)Ug5mm1{oY|KI@koL6OlD?R zkh8+Y<-kB&CSnDlt!F|zI=?U$xBd=~EyF<0_%e3M1#P6hZbYi^HpyU?a7@072L}mcI!5(!CXlUI#$@-J(1gOfn(v=6tXm87v%ag4FBT0}0dN4%pWhT9NYq#u5}sf_1y%Y2 zsu*-cov$GdbPKpPz*O*GH@v?l1Hn<0Z{G@`b}P4Wu|E^9T8DfS`s)LYNzG->b0quSa0pm5Qi@?PSphk zDuEEG82$oZ#5^{nq}6mjcs;Dgr6k}x9W1@W?T{@CkBdduyAH1{UH`hFaMtc-cPaLX zDM?CA=)KoL{J}NpKxc@Mp*ylZlpdVL2U)zlfeuA6ucqGzd=loAojI0{lzfy`9yZ zxHOWlhVO3CkV+d39%&op@)QSmbp(WFZt!JWshoHi)&vkg7p~CqIH! zkBuWd5K!1F-Eo2CB`0&CVRh;CVvw#-1UakbYsZ})sCVRRS-%D*+-p|d% zzJ;N4@#uf70wPF85%#)mO^PYUYE@ZvUWt3zYu3SsZ$ zXlWg1X4Z?K8wVbPf?kidz%{eq*!u-NbAd>59Y#di-p;nax-WuEuhSdH-`f@H!le^AAAJmn~&E@hc>@h*3{tSA0 zv=qnO><|c?DXFO?!{Og}ygpUU1y*J{dsri+H7+h|JRO7loYZ|MceuV>O>?)Sxzti< zekc?gel-&S%w8ET+Ag<4VdYcgR8)NKy}EGH#U8iH*1}R$2V5oc^vUyCgc${tpepV@ zlwjkRJWI}aigO=y> zwBsT|d~=h+)eZRtAl@AvrZ7u5I4}cKp3D7t=xvN8l3+J(bW|Jz^)_t#kSvA2c;dr9 z%m{|E#2h5)3-HSwBN}zM*nUFHV7nvpvNZntgyj zLPK7PY4>A5m38E5W(25vY(TpE zqk?MQm6&BrF)Nigjj}WEY?16w%;y936Qo+fkwBPlmKdkqZ&uCtM69_Wn=Yo9&qZbB zCH+mS1DG8FohpVEUTeoQ*AZ)<3|%9$@I1Qlm=A9BzYwYaAX@`biJS(f

qGeZ=+F_0MU-5ZUU>dpB_*C@9en$cqBs5C>Te}#jC{{dM&y;?9z%dPrE}E$)F55<7x&fy0(KYaE38aDOlKcgf-b3Bpk*vHHQ2E zIZag|K$rw3ev7)}?h00C;@OF;SUKpXd(Wbq6HUQvt21Ujo=0O8-V;9J89;h2)zK9A zPz)j0Bgdt-exF9wnSk1J;Y7YfhObiiEGAYcpGnK0Fp#xWp-Q7*Ys=k7Qc;N{EWHki z@a%SYxL%6LOJqez#S(KSTz+$t!{db8=0M>PGC!ko{+N)kp_>s?L?zx*6gT@2MS}5i zjUkGRH2obFrCzIj1Q`rh!*_yXYUasuR~A5u3rGG_QA7|-Fw^0TX}uD`<_IFfuN4XMmFPi1=5~78v(ALO6pV&-NQa{z&+++(M`{4a*pTHaLSZ zIy^j{2$uQ6W_0#QZEkq2XnR8TJL|{B&$yMYgGi^@6TuO0=^1GJPnZIKd9RODrgFvP z`IJ(eqr>`Thhn}7dfniRr)urn{^pgJC&CqX{|H@?CW#?9y zPr5!@|9C|m{D4#m%89&YnvhBrDt#gNy~+4{D6On6H?|)dhDb6rMKCLe3u8`V^oexLuQvEzwa?i4nj(UF-5UOAj}pNbEV$)a$Cf ze&7(C$XLe&Z+!F2rO8+S}Ej1WS7!n^1Hkj$I0{i>Ihm%L-8jH%M ziKdlmYc!_=hf#1v)6er%|?Wsy~}m z?d&`3#c%R<+dB|c{+tl8))pERa=eoJaoS{Prlg-u4_aU|pL^agYX z17>IMmI3(>L`n&aUZPN$8;>2!p9b(}Xh8_zRb*fr8`?#S3Z)t#zf1FE8vcm8Q4|@H$uvGcV?-ak#%dqGr9W2u+@v zPx4N0ZPKpf;h(d==E%*<%k1lpqDfhFwj?9!I;?^cl^aoK^Lo9nLxLrA{0?MEEN{oa zF~spaiIO0j^54o`?Uv%c&UGv$eMaGSv>a4jXKnWgg*k^O{u3wGI^WF)af1g-G3{ zsnHy~+$9WNZ4nvk!))aKa_c}UKQ>DX6?q}2(?eM37aeHiXxg(-U}q)6uCVmUpsBw6 zJhm1|2@D-EGupp7pL&=jSsjaEmGydHNA5=c$1;_j=~g`#e4z;v7Pbsb@2N>H?8O$^ z60JOt;TN<5BPSo6E;4UF2P;@s?cEl)!>|lTn?%q*nnxwVq#pw}9+_X*^3qygG8jTB1CDydUB{b_8L+-x=@=%I6Vbu}&Z9$cmLSMT&Qv#Q+Y z8Rv`Rd^4q`d)V7io{Nu#TYKt1ZOM-&O;7!ZH+J!_%i36#in&op0+S51k&}=+S@wgh zdAw28$4lJMGpKgo6itLABGY^45%yf=c}n(M-`GZ<2F+!wRB z&~k4c<)+4<|MMRrz>8tUrE8P=FJ=RvZ-Wo^8B*Y?f)A%R4KVuu!faA}1H0Vf zT%lYbzZ~JaRgp)R6d%G93e@Y+Au^){g{_$Nn~Yn??JueW&ih*c2}gj%bY3?-(2L+P zd53`94FuL_{bwF+;QY`5ed1(5xquO!@GXeP4{JQyR6t(qQ>f{!XT03f5GxF>&13qe zG?0$B)%Lo(sIqmjb6jWuD|&tOuWbi*`&vA#pB-$e=`QK8IwZZn-KmiK&F4qxqboUsB#>L1dK$e5TS@*H+;qG3N}-F)H5^cSM~agu=(fR}{4aYTzlA1Mn5)c`OvnK}^9CA?NUgLr4v)%N8= zW`e2yacx-2N~6|ycK}O5hL*aIBBIhqn^chs;ziV7!!_O)yvR+=H6s4)?b5#W0YNV-&(mK>fLAkMVo2N?<1N=mu znaR0_$1eVIh;sb~dOnZq8N{a*B8ym>AkOb!#_DZO>FJi(k#ns4ZeyJ9n;^9;Cn|09 zpyHqtQA~c}GQ4z%lnquomn+-pmAmuPq7!dUl(tbZpNMGb$o%hH5IsO-Y2V_oSA9O( z2mAbXUnWfP5lIm27P?UosvR-ptGvOcUlSl%@s9l(6dP&FecYF4Hm0Sb-K!(tQxgdc 
zrKj=n=ZXcOp)wJ~*e()BS^J4G$R0P&3`vd%6chsAy-kD5fk%yqm>3NAwrM3J?|ou2 zP=6R2uhv}NV-Q~lOhQbt`L#PoQH{DSb)X(7}Q48ZJ!Q_q!+T zSr_Dy!>s}V0_72933AUAQ_TxM@d0J;qa*y{7q=s+2cLm{T&k_ zFE2Y?L|9l2G`z7fW!buwIjn74UORsk)BY|7Iv*Ti22J7DI$S%&At6%{^T`vpJDwS&>7Q~=cumis|d}f-nTaLfRq^jRMiSWC7 z&WM7=Np{}ur1|-2sa9hE^%@G$Q`)?ZVr zp}gC^J-T6enbD8NLz&otFr&8*T4tM}A<&xBvh>und(0#Pv8S;S^^MG=rn z@1fh_;*x}EG0|98jP30y9J3G+hTwXAr;+&@7L4?IT*|uptv0+gq{+Vx6}PydsSed4 z*qBFvVQ)6Z@=Rw1{`oj18COOtbcWS&v=sLQr-}XZNx7|&&{6<4MrYEu?>r?%Bo`F{ z0a4NiP~KO7IGsuI)Fg|q%iCWf1)1V7W<}D?Tm!QMQ)|V>^b2E$cwHok;9cw)XVBuNT$9ye6TG)&0 z3n6>-iY$5+TK3>QNZ|ky7$B(f5Fi_zfDT4~IS5+Hr7Rc)zXt8j2fr;~>v zB=o+^&j%8nCbaE#Uh}xB|6N3~Xiiniyk>ZtAx29u+|8=HkQlO%+bs|5S%Z!V{V;pH z_S54j!`Voa=~$|(lYWrH~F;>P+v^MN=HZN;ug?$uuTX* zk#kYKO!o(0)vAb)7}BIGFh(fipPhM^G@j1c6%zaV`;Rt9RSt1=TF z=CUg))M^u#4-{t33XWN@G53`p^cSvorgDQJHKC7MtLYPH6rn(oK!BI8pQH~$kLHrP z21d2Y>t+uw4mZ<}83DXwdY6T%e%GqcMEB$v6f*`+M!9L}Yk!{BO$H8V6 z+%y&GwGS5VIS0!gt9WSzi!kzB7^aF#*yt(2!f!&C7A9q?HZURo*;org!CpA4W6@&p zB7F-ILzYS0CyPYxulbDZOa*HASOq9pPLW(c*vE6Ee%(xP6vAjh#WWC8+P3k&Ll?7xn6CO#PVVG>l04+zZv{mYCWRE1^Y=@tgx z??1Yn8L01UiCTqv4LRMjjFwWqukYkluaL$%s}{uXCyxZ_e#=RU^dGj?=Lur9)z#r( zDzEg@7jg0noKBeE-8umS2E#HD+yE;7pIx1`NNZtwcDA#-yGRH;Ti?p+E&1<}Jcmc& zbT#{4<>`HN@Q$4W;2q5gc)zqJ6A_LGj&?HP{YU-p!wG8X;NXU}I3Ig;jXvjt_D_cb ze5xH3Y?6Q^owqT*PEkZeez}^q$DH?GU-r`K!1u(wQGO(+PWs;`iFeZ8Y<=n#>aQ>BH4c1y3aSkUKBDnNC+4*v9hxN{wu}R z5)%{gFLrrj^rzkQi+@-K7qVcYkh~Eg^xrg*j%0oJl=--TUt^bb8S?iRR8c9-_bH@Dx54P}na}U4QYSf5TS(b%_%~M)RrM zD#LzpuQ8aGlw3Ug?=t`T5DAIIhY_5K`kU|HI?4a%n?9)!Z(kl064UjKN=fz<3xE5X z4-_zE_(HW|*+Ks9n18>o+l|4}Eaq%9bH4aS$c3<52m2Ql9W=;-TmH-T?@VB8Pxza6 zoWCE_I$C~U9K(QtVHv|CnIb z5P;z1&S}-M^xuAti_p%%HyHVHDdbfzo?CtY-5USe3AJG$)e>fNkC3+iKX@u3A}}8j zB5riE{=1X({}Qj5VSw_T*ki2zP2c*oLIA2^6bR|p#~UT3zHyc=^64yEIwZg#m60Ri zpmO96Z*TX+ma5hsqfI!wn3m}vr~Y9kiavrC(uaxd(`$j=1K64sQW3~8kJNh5B7%bM zu&v&YpG3bF9UNe(%j_?9$o1=t);V*X<3g~0^GQ!nkBWMq#h}}6WD@Nj$TlI?2muxd z=P56_&*kkU>*4V(+~(!QKV|C(BqJOMk*A%{9_TbgJIZR}`I@bloY*YAG+`<=06(ik 
z0uC8XWc%5&_tn;T^g~y{kA3r_8DPSL){cW zb;85L0~ax@A^z~AUS2FLj{5!L^!J+PG?7oPCwnl_ey9_#X2(r^DNX_j7llA916K z>HT2HPxFH!z~I=9`` z8VscbLN>twix<7tpWCm$Om-9eU^oe9?H;El@oU`8+}a(sY17zHPKoDtFPEw1L*xuv z9}6ne%geK(e2bbw0#%WDpe;?+`jt-0(W#qwS~!PH0%n^%UUu(5J8GtQk2uV9EQea7 zhEfz{B@Y#y744On4U5IwsB7XC^a9|V&$Qe*Tl>-OgI>?lm>Fj{*qNAV3@ps8^>t_R zT|%-?(U++|(8~zlTfckwJd(XmlJ(Rn@2P=+w-%N_!@yO2J8~=~J zz2z&HcAT8fM~Q;U^7QhkC?e}nT65)XGv#o0G5qrMI6mA|G!GEh&Gn|W{#UXTaOw$* zrLu#8j7KV4%0m&yJ)f*3tXG~zdZzi@ve%v@5i)HuoIM5Yd)Il~`-rCos;TL`uMj8R zBVmae3lsSR^)6BrU*T-7Ot7%A2P|XKtG_;b_0_#;xc7aQ_uk*#OQz0c8>2RBf4ZZ3KBLSGiA{JOfr|OC*0KGSKoXyoT;yrwJW&eMT|vyhO;u&}tl2{Wj# zH%?6}iKRcWuw|$a67Nu%%mxC`C7gHhWEoY~nt4ij(wv-(7W=>E9iM0jFpkjX-IY-`?09BqL$OOsgK)Kg1Q46i5_oAMCd#H36v_f6w=73DG-4u_a2yR>s=e zOPEl`o27S1BC^VW3>mw#mu#%cj(34Twb%e``s$Qf`euDXa) z6q6~}?@x4&epj6`#@2;qg?Sh zLHu33{hH8+MJBFy*~LxrcnEURXib5hOxq8S4N-6z4BKki-@fav72P%V4BTeGp}Wyl zS?Oi!*2OztB3xVks1QaJAidDS$;rQJ`E+38IAbdAF!%8NMeb?HJwbn4+Ua0qb4va7 ze3;?vWYrqPT_IyE8h_a}dSE&?DM{Bv{^P~%%e`Z)q7(rm?mxmRK}{Gfn5K5qVdRho z;D8Fdk^52s&!+P=hUltlwss?<2^tnQGP)ZF-Z;fZ@5dvSfoNl`rb!wlw@@L>_p7O()(FaRr(Rl zammhB>zczU>9}tgq|9S>8?pQ!>+ofS3c?p3@>(o7N_P);M+V}KzjU}X+*F)AU6jm~ z%@jL`^tqf7aO1BLTyT*`ZUaaSe8HB4+I8H1*~N(L%{2~+353kjXG+B z=&dgSV-u9_Ig-pVv0sE^dPqrATR-_bGmz2IF|uAg_C@5$Qr?pFat24B#0;K9UV*-@ zuEmsRO_elNP&&gS2bm#37Ui27k>=-mgoe4J-=96>4t7LGM$(n29;vj{rv+}BauoS> zRawpr@F(THMryV=9R^#-&UM{ybbD+d^X{ej{h0$xX{%DOqRo9r9tE5UG-}jU2UpziqZizWI1oGrI=N?#~{hTpo}Tv)bLPWmT^Ir z+!7))^Ux>%p7B%Frt~tAp98_K=;-50nJ__bOm84-2Vx*(St0R)3&J16S=bKiHVogs zOlLL3KKoR8_I*HD;h7t^(YdBkWgOUWi`?X>z&}}JbiVDoZ?8*nKHvzMYTGNMzV<7A zN1PgPgNHGOJ7444H65gTJ188Oy14!JId|G}_;Brz^>8_B?_=@mx0q^Ld#}&6HJa@1 z^WEOvDwwI?$Avq5(LO80e_MLg5NpRbD5IYlq7J7fBio!E1oD=jBG+H#Z8>@ikJTjtI&ezws1 zjo(}edLug&cHn2ORxitU+`ox8x0!rnw`T;rG{pCos&|=~*iJb%TX@rzlkevoUk=8t zSFqvNT6PLAIGl4wbFt?9z!bu`qmzPs3Ipyr39=TARNb{yE9EP0>iUyweJr7$>5qPBnMvI;e; zkq!*G=%vA;kdv3rYMZzNWqEACQHaWBjwEJ-Gr(eQr>Ib9po3J3j zK#>>@S-i{Y)u;E;0~M%#gY`oy`|0q0iS1SQrxda1KO2&X;_`SfeT>ILTH@f~c&t?r 
znOmYA;g7+PpI;F6&`LrYz-35U43%|*;)0J{8}nHmBaOO%WOqHE>z=b93-SN(5&-Wl?z*6Fh7D;K2L}7I(H} zam`y!QudCc#vktWJ;d-AV}X*1%rA)={HC0NQ0BobtvG7J{^B0$IpCFt=Vo#!%+BM6e}ukgS+a< z%_sMy?jkR8AtoP1)Df=1_-Ue4fxx(IPOIVlwa>dc7hBkY2V>ngmgjI6((<^x&M!0Q zUTAm+G`kwH2{E}|HObwfaQ!!d0qO13#jr&G@ESo-l1if=0t5nz2kEcRM?9r0fx0i= z3~Mj14vx60bN7~gYmkqa%O@p>lyhG4vA4DY<>kZ!W+sXYRlOT%-8flk12e-cNI%fIC04B-jb=gq&|%5Hp`s@IlypZw&}W6!kRnb^)| ztHU4nmH%8>z}3!rbKPfn5AiZ#J6chz>#FKxAEK2R6sj9bz|-S)?hM@fvC7&}B|wor zPrF0ixt;ZD1!pgtxPw18hFNP>rie-^s{kzNy?n1s!e#LwjOr{Ic}x&%cFYwuQ({um zC%gWsK>^nMMua#_l@XYg=*2eu(>u`XSlCn~dATmAASXJ4+2Pw=^q;Io!p2RD{fJLF z_9^yU-!O)*rmJjzD!A&aOQrm*n!GHyP6xJ$4HX%b=F*(0aLD10?Fez7kl>8Xr_bV7 z1)k_9Wff$!nU5?nQ$!=hSAS(>SE~Yzf9G=knHBPVB0b+}bLu&C}jhom7Y};>T@PB>O+wQATce z(tVQS7kgixmzv^imB~eu&9bSoAn5S)zKMb>j(H$?|N63Fj=@6irEFR#%b(|&G<+-0 zFN__3D<6@ajb-CACFb2Ijk*H$`S-x+hKjWc9m?pA%J)ayFLD+zY=PF=d@Di%P6?9C zEFB{m_0lxJX%hT7f7FUHL43h67{0(EtUWfj%-y1d1S@w|KLjm%ULOj%G{(9%1O`Yp zg~~vN2+Ddq6~@u*K|@g^wOSbIF=or*z(&-DhnuIc8++XoM`Z#R%w+V<1gZ-p;Y?IRyNpJ7P5A~H-Xj25$&k2*DIO5ct-|3F8$k`;MvLy@~D7> zgf!`)6)9QYc=EjKaPmQPF^Q512Yc4n$ecRs_lBq}R2J$9`&YcZZHhy#kn9v(SQN^` zCWy+eX#uJE$|+Bd6Zs|j60_`Sw5V-vPPSUF+<=7qKz3^Q5)%n$ zY-A*zf67u?llA_zL5}P}9iH1)rK5;|Bz85gNI6>BkiT~-iGuqLy;u^$lk0~Iv)QIB zU_MK@*#Ba_(LGf#`b>>(!C5b`bZLd-UQ&oS*&X#*k5{n$ARJ<`G_^6{?hwN)`1SaC zcv&@uM#X-7cAPkVHo=ZkSA)$%X)_KF$(!g-Ox`y@aZVg&S z*QIyJ*b@3KoyV2SvhVhczg)A`a6drjf*Zv158JjT`WByDzdpw%YzrNJnqNZW3`I<0 zDDnV>yf|1V$5R!%jkbsu*b%)`GVLa8X)o`e4pe&IsEqXrD-nLj!yza>ChPsjjLMQOtrkNR$qWGmBO&L9vt4*b)9vddVhb`JT zH<#C&K%?MFA&PZxoN1Gp88)^CdDjD(yHPMsExQ zB6%J;PWoU}r^r5pc^$pp%o@J8gn|-Br=F$KtP=l;f?t2rI}-X4*pWw*&uZS`UPg8D zdplnKnAHnyL&kf#x!&RuaDF|h_}<)X!Pk#NfWcy@_1J$aN^N3#FXcJ0KdtbCfsYAEP)h0lrbxsoM5#Lms=mUw?T4yh?T$H zyUpt$muTztwnP<68Jo?icNzI=%hrNEx>fesi@(*~yl?)-oyVrfVimk+)E&;Y~b`L^bh&{d64zEgRITt?v!ZE?wwp{aeYyE&W$`3dDa z+MSallOLsPKYnC2(i&#vkb>5|xG7Dkz5Q1L{x3xcELaH0bT5~^BGD!nse=3yG*?~| zz0B83t$P~>N%FkDAZG%GFIobPm)qZ4#slo7+TvEf7BnPDt2x>U7V`G=ATxr8 
z`NIs7^}E!PkisJQQYv)oWn5s6cXwJCGTQzhw$3sr&TiY*2`&MGTX1)GcMI=(44*vVN^ z7afjQmQG`|4CQ>lM^{vaa(Z#A;1CY>GM|N08Oie%!cb!%~uDUWM)lpGrTyiq*Luu3GF9;gcKokn;pwIIEe} z(;IFeaPzq8jg9*0$`GZd#igaEOQND;Afeq0K8y@`#GVMWqIqXD*5*UfxSAc^vL)NU zE>FXXbBI^%okk;y*Z}i&PqaXy~rls+XW#O;Trr3W*XrKI~3Yo!yzuZ_E!d+Hf z;X~J?h?-^fajaijRz6xzHXYyh0o$DyXuAj**kw~7h!l~L&JYv2;1yKYFu1ga(ZSWj@0js)nFmuHov9&gjQ1_ zsq^mrrPoQJkWnpXHBa_r%}t6V8}zfG{;UsiS$g$svRP4(VD8D z*xY?>ISbIoqLe6>r$jRLKe0JRj0St`ott)R_Bi@hbi+gXWv7@^lj!5Ii=m2pWPzwB27 ze&f5sr27;8=Xw>~h|I3!z1<40&|gTpH?=uHj+Ds^W^Ih-{2PL4-LZ;5u=dis8N!d@HUie=@8?_jFo%YaNL@j$g9zZ1t%SWVt2 zGQPk=)2DS&j%tOILxj)mv_6RbtH0$@BHRweA}m^vCqG^+F6*`<$f^0#9M%aJaXpWMV;w zk2t3?s{TYfdEP@5K+wEBzahl+v~f)5Kg1`<=yWm81Md`G+std?Y&dcG1z+>1*cDW( z^?oKiJf`St>JvqT%~?w2g=F|-FNDr)yjJccy46kl$ai?CcWl6c!go7?=l{2H=x>?*(kP zji_mbD<`+(pL05IKgqZGhh~qMeMc5i&n}-ikJHNCn^H%dw+oA}_?Ks4#SvI?x?(Tj zB3TQ?yUX6@W-Ft4i1)kN5F3k;_?0e+Rw596Skyg>Pil7>m!xGu^s`{mgSWTD$B&uz zcGTp>)-&aRcK6^jUAhLY3$3>@&g?cmyrxn(LK@PLl}o;U@bU6$1!;HUoc zc-$eB@ z04DVrqRC!eBOP#Jfle`;?Lw90f%gBVi>z#5c?IF2l|kJdhVlJ-dt1x13l%~^3>dN9 zBbNvkqY$O};Zhn~$}uoQdcFT`2#821h)wLGRBjLJWmb{B@3SEM$RdhP?Y4KQ!KO3t z2W`%uXeR=umn85Rz17bx8O_~jCXIgcLT|eCyz|!{Yiao7PFp8r^6TYfE~;X3H05H`cEapOU`I< z6*j2sN?(?D#&IX5Np;6UAXlQYH^aZ~$CHkl8tVp7$70wK?_+(IoDSUs@?VJ9CN>s& zEq388(dCH2(}FFh?-Ev1l>1W{dvWL&&W85Np0BSvS1RH?MF&zP6z}CQ$dOzS*?586 zw=AA@*}e-UQ%U_Vv~a!XPKzgbnxHh2n;1^AfRo>FMXo+bC&J2DXwb6EzQ@(F+5*^020VX(b--5#7^5`AvsD@N!~UH^- z{38BgG)A}{_!uH6crrS=(8?~WGsbP$@G-3=J3HjzLG(T}HC2_%5rM)A`whhG&JLMQ z(+Z!P4Ya1_#P(Ao(3vFGS<3lkoK*MAfxYEuTSOE_1Ij<@jx^OLqsNWCQn+yBB-T>& zahXYV%kKXE`rU7<&awp^KAU=Oyku#SF@*_otJfLiang8K9w&KOL*)AV(N|aXK+#%a z`m0UH>s04*F8G1#STLKcU2*|RA-`W$Sb=Jj=4Z~5^-qu{e;hhE?;DJ`_)nPT!O(!?+k0b`v6Xi z)Gw|6_y~Abuk`maxQOXIK=B)wU!*X-$?*K5&F`Y5wuSG{8-aCay1mc%Oo@vZZZ@@f zjT>GmRwGmA;eH?x7^lj`x-qj-8R-ea2rp0_@Q!YU!{|^FffT_WX(J`R_?o4u`it?u zsmlUy5t!$b1TarLo^{|A)ppwvH`ss%${*8&;yY{Pc8|t#+DHW@`HG}&vIhG7L$Z-i z4i65d?UZ_d5-&oY-pvQ8hJvLw(|=gCJgyYmZh|YQlr0Qbhvq6BTK(m1 
ztb_u&s!5TLd1>^dQcgKYZJ8waqA~=f_p~%J+?_PL9WI@Zlc1cgvuM?C3%93j^}d49 z{XMZU-6>NFKi_u+x=$>_)IYmT;0V;54gHjtB3$vKj5L*Bynnpc+~Hu#YQ?RmN=@0N z&3iz3^8cu)=y7pm91;S2eJRw5x9-@m3B6yL(1oOr^R(A)cj*?BEr{(fW{{ zrvC4#sRAvQaJ=Ac`jzTg{?+)hKXZHB1_GcE9-Ej@8S}0^y4@}c#_M}RA4Xc59vo5& z>8$w!eCg(vp3uybxOi{Vv?@e=R_x}}{H(3Z+A>p)bc8~ae-0h8AF!Is7(@$}N*!EySG~RxoegZ9Zyt8(J(PPGG2!iAqr^ z%BJTCfg-gPj0v((RW;~7VF?W>46qt}?Ry!RFLXGIV;C6l%8ow*Z)RXv02kIs4OlV? zt*bJ8?j$b=I&pP7;sBD8+fR_%E#TU#SQ7a29CP-hiUmMBuh#D$Gua1umwtwTwt9Zv z^@N5gH-p2>dXEI1_rP42oKyHp%ZBL3+2Sh%a!WyNi~Gu#+`Eg_))Qru(Ef3xpf!al zi|#YDbDH|D54JJP{R`I5|ASim`^=_}Hs9zTpt3PnU!17iVLZeOl+f_Hi9*d9kT)=N z;LR~<8WtHrl%mC8k-!pMs`A+@%J8x@`0XBZ#^c3eP!j~5=IxXDy4A#0`_QgKVG^q? z>M`YaVXgA7p?{4 zGXLULw@80DNf>!Q%pqJLE4b%Bz6aTr%YsBMbuj5twkt6U<(0j?A8MYDehIGXygEcs zw{}{ZqgdPn(vK;5`c&oAkAtwqmNt6({-nP^o9}$CF6`kR zpymzLZwhBVuHTB#!PoD6)n-3@>iuTN2i5SPR;rXkWO1-xCvqVJ?-YBgKRS~D*m%$m z>0=c$rE}grV-=otZN;wP+1W3pWwwAFe`)hes&N_QpPYh7kq?a|8gu&ofO@KU^KSSP z@7zkRm!f28{7DK^nup_$vRYqX$fdz2A0MgpfCfljl?iwYPiK{`YkomqMW2357fG|i z5*AfmFt3UeOu0Z;Kjz*8*ZzWI&j)`a*Xc)wih#{+1UZdif~u}&=-_3gOHybcM_UbLSms>10IK28$Uf#y( z!Q{~+z74|d7lr5j&%$FtgqV}k|MkZ1_bh%>m916JO2x?UUiQ=Z&pFTM&%13(auiL~ zf@_LnbETRzwmfecY}gLHJ$Gr^aq@OtZKo@iY^7mWa!ODPfCK*EkQY2AF3#g3u|alx zChzVprt|W>4*zA2A`v*g2T-noGjiYJnEHl(Wt-f99fcoP^-{tiHT7Sp#|kkptSOEP z+05r;G2awYFiT62QOZ}mt)X4V4!=rJ$s0h2 zn5yz=+`QA4YxO@L%{sJ06Im~(u-EmeSWkG6ZHbrEv+Me@Dh@$If#7TUGOZ4OZWq#d zv0~nWf-%t8k77yQfOn^}sDluk@q-3!L--#gygu`1RtnSDYAcmqkB8eKLhHiJ=+lL{ zq883(U16Ar9}|-*8QHpe%khtAyGDV$Gunj(h0ml&FCWSyR(OGf)wAQDl3k9OJcJt%}*%=aphBMf{_#1oP%3O7ErDn z7m|qleP3GuPvz`FkHQV#jgkttp|rp05KGZ4NN2j};Y~gTgvKyFCibJu2qL)C-sxzU z+Cl|E3y;e*p{PNCOtZP7qJrrFY-@H?C+R02XoERHWYdZ zYM)BAJ0rruo$xn7uk4`QxA$NdrN>EgIXHq*8(0T~jo|!vJU2QH#~g+R$>fE~irZTU z9$MZY><}13fl+hu*{P{4cAsXGK=xonOq+#KLvGY8RPvM{w0|wI^(j9`1CSBqaQF5w z92Oe0B^gRcQ$}-^=#(<@mrHQhFe?%nOstbBzOPQlcKi-TFl6ogFz+NvJyCy?TeBSb 
z@>;C0r@Ka}J(k1apsHD4*ms|hqDW{`2Z>PfzA^R)zQZ%wkJ`b^j@$Z%+1rR_IITK&oci@!Y&8FMnBu8HOqi&{v5}Ul-6;EMl35DA`3>VIoULpC;4lg6x}=+W-S*r@|QA_|O*9Crce~IbVo*W!Tr9 zs(3obi1!=rvR=-d9T~Q2t7LE5wk?D2F$KDsGNor^xmD*ldH7pXHPH~70mX-H-3r|2 zFKDtib_Uc(`IXK9kFUg|g7csGmVP3Lee3<CF5bk zQNzAhvg)c-l;qGv>+uI1<%~6ow$*cYwp?=l2WR(p^HAY2#T>5JuA$G<8Mg-u!F;Ey z{#)Dm@kEq2jUM&uOqDu6C^n+|{cI!??&wn+(@|BL=Eh?PF_%0+QSqOX8`646TDf2b zxUp>h&Xe?y#4Lg|)XZ9Yc}6*i|i=h7=p;pl?Sb>PZpOst=tgF#c?_gi&vr4>gdMLoR&O$G&N@%tu?ALRrhbDr( z2o(AD5Up3S*H1+xo8DM1=0rx9Has$^%#hw!gw41a`I%NC$Pl3S+}>AKO%BW6D0uAd zBY56LmS$V5Z&E~vnc&C6pU~&35iQ})Zm@Q2u?;38ZcZ-kj11s?d^~omEthrFEl*nj zGzm%{l#f~QH6a4kYIk>c|6ni{=^fH`@Fd0XZm55#BNB45Ae81GAdMIhHi(OvAxWZ5 zDEMW)egxbgU`~S>cMkHZ9s7q<0p2h^z6Wjw+^{}sJarTf4|=lpwOiHcNEOZsA-MeF zj@e1+ikEHaZ1jj`K%EFx|DNYI;O5RfFUI6}jZ8N+5G&a|-kPgGtFePeMj zoNkbBiu)s7?hwFUx>NgxVavuT6$PhXFT!J%@{U~o2BmyKMHbOz{F4|##7_KAK94=+ zXPPN?0co8%OWEr8VwjepRfyt(m0@c~Bv)N5;827NP%_5sX&!k_wwy_5QAQ@5k>aKD*mcyh8~%%8cTeP_?fGa0wt)jSHgiBc0&n(Ur4v$c)M#Z60YS zkc+J8&Ta$|3-PGy-(08|r8+F-3J@^I|7JwT5g@L5WorUNHPfn1*NdKPR4@GfZ3wa; zE`~SuR@iXCU!|?BuhXaBGlHD(-BrbRkF3S$rYGtFNq9zose{mL*f~~|jCxX7QnT`y z9=PW7H(fwX3;>m^wq?fA-wW#!u;S25G3h;*FmcBX^j}o-$P)nW@Sh=7aNCG^7|O z{2q)P0{YY<|CAKs@Or#H%(NGkXDXF21VPJB_ER}q9}f=c161NK*j|-)u;VJ7jmx+s zRpQYB>&(b#5?cfiv2gRsTUM{1!@1lZ7t+F-#=Jn{Dpe*(mMM0zgnN@o;|0Jmhk_S@ zy|nR-q4cj#YfV9lHi$ln(A0G8vkUB4$8%9=a!k582Hlzv0Dc8zyWDsP0hs3nzcqH~ zqkitKyFj-#i=h~?yL^PUbcpzt=vH3tBr}-ek|G!)b4Upsf(b?m^LLNdpsFfDm+PtD z9)~aKKj9im7dv+68`4lrc-+3M^wLgpTDa?=FPj_+6js5t-cf2w*{7sF>tx zTrSo?<5PC57Uj{*{(bMa#oP~}%oZ_}KhUc$#c#Z0{437`!CF8i!B4K-$Pdpm9&z4g zU%kxa=l^t08nBijBc%A78W&!>_5KL_JXh_)is<#0c>L#L20TixgoOW#@9hj7NJA;hZ9ifTx5TxSB$k!EL*Uf}V1X$J(xcJ8_*6 zKK)H?zao2uhd6Jk}L+_>E5YaWhiqY$qkQUCeO{`8==j0yIIAn;qS+TJ`Ib z<;y60m#6BzhAaDnUNfvr@s%nk_o{(1GUpcyfgMpmLXwz_y7WJd5i9mUfZ8%>^_1Gn zud67aOR-Zg{l;``i(Si;w*56B(zJuSU_r)^AHLJ$lEt*nnlC_7wCP*=Cyw2Tu>UnJ8?V$26dRiB--W$C@@K*$(;meY$gX#CwsA6B1uX0L{7#J4}x*Gf4d)p8=>1hXe2Nsm8bD7Y?G=a^+mAfPpZZL8q%`J->v)Xw<#= 
z=inrs0Jn*&2Ke(04k$sX-wfH;C_-je^=}(*z#Cki1lr^rPjoCcBJ&t5;(y45H*f99 zKE158qp*NqW~i$MO&e0NfH(tfJ*QYjtfuwr=lJ}HfbJW@wETAu8JaKoKSq3UJzsb3 z%}$`)c}{gWZ$`a#6_@+;uda}A9+`4hyk2jh$r5W8%Wk1Gn-JPA0Q_Rp6ioEVK%R{P zuuP7qHKMnc29{+*w)<;WWV_;6lCG}+BJ>8;!tM++3CW|U+RCEnE(6pnE1Ufc@)p0T6;|YRD-YKyQ1S=Kk8le zbZncfps|ID>S`&l&3*7V3!mWTQHMRL*;ZO$zHLf?=Kpvhgcg5&#DcI{b5vR#D8M;&!url0-pBXsE%>MjCK33Q&-ymj3<& z3;0J#=)m)*Vq+7tYE*HXS?a)AP%D1se6rTT%V6&BTameU{)OO!`LeOzo?-WfM4L{v zsvIMOi?NQQT?X8014$#6*s0d;AeVJyh-7woUx0t^1m56D3kO)ViaQSTAP>jPf?Nzq zYSY{Mghi(8C5uG-#-kBwwUz5j$Du|3W*6{-d4jsi)Mu`2?D!9YJlB^djN$T^q#D3} zo#aXpy{P$>!XAW&QO|5Q=`83=!Aom$Y`D42TBJO9>=U(KpxxiCj@Z}J=R2&bStg9& z*y+8j(WV4RA=vRYN;9tn5*NOuoD@i+s>cNp{NEw<&uai^eD8Y&!RCi4#3UJiy8LYu zn?{V`G$bbg)1Xv8{_5(=^id2a$67z%%O(=5?)KbV6*o)3i3Z@3qrguG$+wK%iqCg1 zhz8UWD!tVv6{BD@J2RT8xN*8&M;1t;6~9Iw)%RrdCf1xUw4VGwkdpmrZfV&@3&W zR_4Lqx1mZrWRj)I%NK(A8Kn1J7>8UH+SM{h{`!xNgg&030w^9FN1UBAg-S9LTF@)S z^l0U}+v*g$Xbnt;2oo^s$u;9Z|AO~udy_>&K);D@h3q8(FV^-e(7R$$^Li2DQ@BEN zv&hhN;@_^QSi8r!|CWlR>Qv=sS zlzw?ae08sNH7>N;l)rtrYjO%m$qdYil>7TeK~^kr)o&!_)2h-y>S@kLY~P26heJJ_ z!8%X?x;QLnPBEG_KLBTb1Osbt!7e`>0fdQ6gJ@4DX`_0|4USeDJ%PT@uhO~858{NtVh4sjW(=QU6H&7pfr?_7!eph{2T(z~F8}*~3 z!MV+8_|UePoc>P76~pH=q#SE&%YN$Sh@$-6&ytrBYtN4UW&Tr%px{!B- zbKMi*Y{RY`w%OvMV!18X4F6yC>S?%XXfhh6SJN+zHyTk&X;qq-H&ojI&@+hBqxE+V z_>!L;+%Ez)PY^PYt4!WTxPZf)UiJU4Z3_4Ul=O_HcNg33YRZHV7hLZWGqKrT}M1b5%xxxE9oW^e-Sp$7S4iP#-p*W5hem0H^eJJxn4 z#cfU;T0d$$8a;-S+bIMDj%R1sY*HWaGa-QF_*zB`#sB8x|0c(UMaVwgKUFe)DUhyj z+n~rAR`))TmNCmKsgBtbh58_^z-bm#K;AL1P}OjjFCqr?kZBqAg2BMgR~*|SXEQBQt0#DmU< zg7#n5by1^HRLMhYrD^fn7KG-ZHbpo6TP|Ki{sgspq?NVH zC7Y4!Obhrwd)t|iZZaFz3g;VH*ex}yP{?pqoZpRPR4CR`>>FW>&$vVMf`su9mfpSI zDX+NQe+owJI$7OSbr`b6dH^Odbkfg}p%OMWwx9|ZS9i9iAE>eyP3@LbUp&_BCiq)~ z4BHjxuKAMVGn*|Y=Kr`EC7f(cb}LTIOexX^o$t$%j#X+i#Ys-_@_)U{e}VJMr80bx zZv{?H{^wga6}z`M##PI<*s-b>9+H*DLk3agu`Tv3QKec%1`2&slPc9gFbvP8RQ!brdk)q$;!ItW@ih6Ta`Vvax8=k7ZG-2)vC(z&j)IH;{ijFjPY7RH%Yb 
zPR%S~fO5XJq2*xk;w{BeZTVV!WIH|;>=?m^h4}xicmU)s3c<+_(^Gh0G`)XwhjYA)RtB8kYpvkV{l2QEy=VNb~cHL-{@H+6k|^He6;v} zYLbfljWq$W$E7wPBg)TVqI$){dt%am-L4zE+ZCW`P)^}e`& zcC=QM0>vU($?|et?ulRc~wg@56XwBEL7UnqE2Ff?hjI~ zE~6ISk6wjW|rC4@qCAI-jj@VJ71zHITel@XVBuv{o{3AIr% z3oVv0_KO`3ltNh^Gcd>s6rNryGAnJqt|VpwfY8Bv$-paDoyRsDsTFB3h$bn~v0lA^ z?Y16Fo5GEW4HokC{O!Mgb_r9z2eQ0`pwwX4nvWs;>c5^;B>g4U&bX=t)|lBaE>I)Dg}i8F^g zVo<1?;Shi#B7l&CBb004tyBJqJNWx|9cGKAt7y?Y*@e?p!QJDqmmillc%(G?RS2rP zTQ3YZgaskZ<;M~1p5v4(ySt6EGlw1Ab%Vo;{8!$|9anoo9%79$3 zGLfM2>z%YT$cKY-I5(_FI%|B4#c8<9{0RA>hOgIRe1IoUu*lt@-Rcm? z^=mTBqYlqDH3;x0 zSs**#Dx0V-V-e(+?{YvRhG^Y6^XXJZ`v^cPSIL!+>kGk6)h%^a6<1E2V_Gi4kt0b? zRjdIpNJ7@C-FWR+=4+_ZyY4M;_q*tKK@YUn_!&5uE^fZp{EJh!d|ZU2!@?Av#!!D| z#k34D1;g=4gX_a)qY=Imvo|hx@_iTZ-d8~WF;TYnw;j^nIx1)saj!V{g8Wb0 zUz^7!zg!-29TJY`r?_q^6V*P0(u?e@kBR*fWvdqEBaaH4BU0XvN*6KgeS6M2v<&WY zp>lU8;*Mszqn}1?+%yr=hZi7CRN~78uH%a@(N|@QG{hpfbLsmzw2M5zit2Z;(&jB* zodC=AP0;D=#$KsRM+~Ubf#z_vT1t5e)39-GFt2@`tuU|Fje*gHgMMw0#td0I4wOFL zOr_i~kT%;-6WeXJSQCX`Xrn?^TB;l{d9Sm7xFJ0@^#!v%j^-@rSBSvUm|`|xDIa|` zJ|w>AuKMhB{Vwk_x<&q33d-iDNh$X`GAT`&Ki(1=amfE)%e;ZxVhh|VG}~c*&$Fc? 
zI~()~xZhV!?)9I*#o)Mg?YJ9ihkD5=tdfQbK%hsn29G^d3q zr+WD*Z3Si%x0jA=o_Z(0LY;EzBv3508-J+$?#6IIjN`+1eK1U(0TfJhi}&|09OfUG zPW9!I*9%=vdKf7?Tl4aqvO==VK2*_in4(|8cRKR4I3DjLb&^w>A@&WqMTBvU?YX(0 z;tighs#cQloa(j)cXS+}Q-^uS$L4&vkV->|-E*!D0sxCbZX_M*5=h+4L!YKPqHj?<#5p(4%;}yR~}Q5 zR89PJ+wqsdxWNupfI|9>j`Uy9#^2%aFMxwR&@$G*(xSHPDl%z&k9Ng1#`oYU4q4%# z`88wCaphl{q{To(qDZoTo%>5-qaka3D28>+Z2` z2LKZRwkAA-(tYIplm<{9x;wcPAHYT_1?$(2Y56>S8+q)f&iz|||Cs9w*= z2SOs0W4H>TM;zZu+8BegI{JbTzCmkqO0;;lsGS=69I$_%{b7C#9P!|>rD3pN8YQXo+H2k%5)V#JLYP83;+9CD*C9jB9;BgsvA%aOFr{g^H5|K5(K`{?BluS znk!kjVb?ty#JA`2IxVCb>|9d65m9Pq+I?{ds~A44Sq74PI@lSBE>YOuJH%%)REXsD zJj;#KqL&ZUu1;p$nz{7B#XUO3E!i+vtIb4lG*rL`i(;v!rQrfRo*MTWVAAjeV@V&) z&-0<+uo+H{tilL5?~S|I?FypruZvY>7Z!&F-Qwd!kVEv(B(@>A7V0Xle&#u3IzVv* zNw&}0Vnxe`jcIfx-xozk>(1nhuai;v@&27b;#n?_T6^i{g zuKCYP1^pj*#TA$$P^`iGp@2<^iyMwXrgAK*a&ejT(q0 zR)@nO`*gaynw;-}1JZDHbGxSYHF^5na>XaV$J`t7JzdEADQZMCFaEf&&~xQW+(L!@ znP5L73zLbxvGpU*PQj=Cp%M>7gH(~_zEv+zjV9))O0CJy*&2v*71@pb?MBxhqqP5T zo34Mo7N&qob-go7H~~srYpbL;e6ShWVhc@27~hyY-l{=lvN(I*(ny*ZYJDDq5uM_Oy5APG>cfFvcFcnk0j#bi z!4idyeXzTS6Tnb#U`+)5ioefTNQj@jvZnu4y5X#l1LwxTczeYbloIsA?7zkGsIHEu zJhXO0Ds84l<*DaZZq|j}Rm~iXHRyh7CS!8gpN<9Lj+Z_!#(Eqqi$dOqotUPPMb&=S zDP=Qp?zF`|1!54W2x*ORHS2}oZdW%KA)jCG659{Ax*Q@_I%0CUwI(Qr9I(>=>M}Nz zXHU;Q7Lru>_()H(=9w7BAuFx7C+Hcp&3^J}Xw2Ak4V615{^!Px@ISt1HO zu}?g01CdYU9HFLcq}+QvzeLT{OhgAgYyt<@FR|wLF^YMK;wzmhSqcaBE^1&?H6dH_ zzDJzCGPY!?78egs;J}yaRDyMCGSj;7y2gnCwLIp|g_RB@Z=^JXd{+rv0%>_n8l?h9 zBkIjh6f=8QZ0Huk0rW!`x2({jN*6BxA+?PPD7LqcaW3XEyRSg#jtZrUWt7VfoDo-A zCx;5*7cz2m@%+b#%HabY-pm+L=5^4I`o_d8wH>77{!Dt794=-%uYISiMa9r5#M8Em z@P0G;TTs3$&^PGn>6~Hb-~iebsGw<0%JLK=a2?ZtljA**l5o&|#wY8i@bW9r-2-onq?^7tl((@6&I z$=+_X)$oM#Hc|E*IU0y^AojFqaWo*$%NFw#!1Ft3+kyF(GprKAH-WAn8~ zG{Ue*QO{lBS^+kWNdul*k(gVQQay0Z+%eEN1|)9%SuAE(X5ZL?6uRYUikC z{Td*BOPB~-S*(-`X5GZcL}hTRYElIK!+=c#X463UGQA$&EGU0Dz9=%KQY#&V@v^bp zfarmfN9j7%J9?`$L1pesXJUP`$pl%Xi^Jb2bPzVVx+Xcj+2sU2rZ35isO-d@ht!e*F#U!Xa9W?*xJ; zvv{@ZU_TXXZf?el*ILW565g4NgzauUrx~}$W~4N2SKt%ylqV~CX(rA 
z)d=RDZIsApAmVmB29;gj4icIQBcBD6NmI+=LM3tMWM?xp@q!D#e`FOrK(GQwo|6bN zJ}d;A0uOm8h4qs=jcQeXPH8P~aAM(Vr}bRU?#YYKYaPqftn%VupRxgL=}TW*S$pyb zA*J`4u7NGj{v}GIv}$xtCi@CYgwA>}f^_L9%=SykE32u{Z}BRF(*88JJ_UcbMR$Dl z7CUqQn}*1~USAb(0^h`#uw+aBW3Nw_oK-F>Q-t=0{UB+L3-xHJM4R8J^tgKVXB&T$ zq+3X=|F*)hK*XJbU4(#GQ$(7|F*UmC=7F1^Y_b#0sebX7*w&ixxQ(0Mi3e5`BA<6QRNgYdqPjkGD+_@0!%y3M?kxTHJ+dbY zn?Ysl&-N*haZ0#;S!zm-&G0$DSUJJ;9mK&YZx)aBf#&L9jg5CqXfymGbqix@#^fcA z%x=HS)QUf{@F>*$95k`obgYFbql49dLrl-dVXVtgF#}s$Q!`%NgTf2(nZKEZ5iD39UKR8)b+omSTNS6PqaBqapjOM7?xb)z zioSuCBjE3K?~8N%YYan9MnOJIu6(XgGVOvQH56l1*4};fyS-7TG4IRgqqQpZVXYT| z++2TkN@_B>%K@tc5|QvBy-r`MC#RsC7Hs9P3?B&QUEM*KLiJ|_y+r8K~uLX+(+V)%$3XCyi zRCO6++c5%EyMB^7e?aA_P^PL(6uGdmOWUl$ai)j*>G}``8*HYsoicq8fz@g|PZm~D zH`SL8X~FtOjH$OmWsnG|tQ&Thje&GKj)T%;b$n|e^pyQfr~%OFhHXRU=x%Yp!nH$G zHt*R%a0a_sF|8EUql`-^9T&>iGFkW8ftfg_Ge+JR6jF$P9_0i58E5u$SRdG+N(_4& zHxZNuuJJlzr}Sb}%OCED#Aa)z@zXz``@sZB;5SZ5Z1fvkw=ruy_IUj3mnpA11}vg) zi^TQ8Ca(wMeD}w1y!OJN;J1t_j=0{INwoPH`ylAY@b0|}bn1VT3; z+jX|dC+{!K?*ivN98;wTK7cosQXzCkfr}OKu5~<)c*^y*TWC4;+ub8W7<{BPL|x9N zR1!?`SNU6fwiV{R{Q1sGxyEcctmb~0E8^>JpUT)BQzXk57s>eAlYnK^zqsDL3E;`n z`vrR$$?iDkxEv3^A;pv<6rYKq=mPYar}W#Llvy}6{7~q3m{EVi5VNP4F4#bAkk9{Y zyXa<>cYA6m7~uKneKunItEnnmCM+^}dF;m!RX`foRfaacIkxJ_gs%aQPP%+{=|$vj zLBYxFZN~T-q$1IZk!;nomv65PQkLF3=iM+9ioshP?(J9(wltC8Yyn|gN4$oEfu5xP z1wLsz;p9}}a|_LU=SuyJy&l4kc#jJ9u!2_h7!RxOlX3T6U4jTj|BUuG{|u$S1E2n_ ze|AFqS3F9dZBCh0gZhJcX^qQQ0ZBekChvz@hk@BSRIf%Vkn{S^_=##uz{<&xQ#%7a z!oup>dMxS5l4t)K@F(Nb!-?>N{W@AY^;UC6w?v_riC%_Qr;WKRQ!qR52fW?Yq_#FTCc8!%sU8|u`O~!s^7S#iB!6b7 zus+07tb>nK!!ibVUT45XH8Cv>v;KDYp(ua_QN+6Csdw$LUbJ>~yb58NwiHpS;5tYj zgndpwHn%eUfdE{nRB&a+VW)g}7wEE4QCnAwa>Tz9&{MLJJ34j0gdy{BT(iCcImZ1t z#N*LxmUpLXf7S}60%2bSKHI?w<|d<6 zH^#B(x2O{$2n|EY>irSP6wE{K_Aat4$6~DJVv;{nt6?G>f_}CJL3M;W#0)MK@_=QY z2;zRW*WK;B5B!V=ovv0U`ucroN!InSxX(s8$s0Nz4Xh4!M@IXdSK2fYJ1dz`?&mu+ ziTr?B!BL7fK5Zbce{pl^xYRNyr5hU={$zPL`L$Ie5*vRUX1%qEp4ZRpFieK7e8%Br zLY5qnyVd)`$-fBJQ1V*ZZUyuL!sKn0zM|@EvS^&jHN+Q>Ks)=MZn=ckX 
zl3x{sY{pkb(#+!aG|*F5a33}7JZ(k4ISWln zn5n&NZ+2cTMmZ7uc5lB-?Dr4t@?4?{e_ir`a&$On*1i? zoPMJ?*1|9y%>zR zzf*oXUS$zMzll*bqm=(<6=ULFK|>>X-`cAqxlcYW#6Y?f>(4e70>xTaJIThDHfaC- zYHt&|*zw-nQOL(#i1dpWJ1Zjng`S+@ z_;?v~vc@cMC+sLA;Z4lo(_TUeiPs=7wvcdmszojp12SiKmlriR9O}M=4{cce= znlr7d^Zk7XJd~O4a~H_ss&mfWys^%=zurq&tzPj)^Fk3Vk}!Uvv}%nqhjE=1*#kds zTqHWI7(CaVrBP027wwNyJXcI~Dn2UuUaa-42E6#LZ>_Q;r8`gBYpXGYG7btaa`*0v0rtpMlB2Y_jTa0~2c+8=AD8dsBU3L;vu{4i3YxvCoh$;>6V7Ahx#kdM*V8Ry1ol zZ0$(EGHwi1{%Q&hH=)+~WwH_2=y7)RGmjo6VLCFW z)VZ$_KDLpTE+cu0a}hDQs*{1eRUsmKP{j)Za|Sq&@G}gOgE?XEwbv8d0;@@oU!9F7 zJ~qocWGPRdfrF3DrCAWoFK;rD&2=k3zhVnpbCM4QOg0O~Zip1Omlfspz=6<#FODBSAO-*a<;v7&N zfujadvfwSIY1qYGQqryW-OHMI_uhEUMNYY@9&NTQ_EE{)CPhVdJ+SScCpZ3hjgL((W2|7+4RJsW&=+Qgq z%Nq0X+#k10qH9K?UGu>`YY&Ao9AqH7XILGlUzY-v17!Nd<2ArwyJX(Rs$S}*_*zk| z<293;)4m7k6MA)v?!0%+T)om8{W+O}s`Hj+NOFPtyibodYcZtK&afgpF!GxW{zm`i zp?_=I16I$FQHxNvp0v}X`dsuzqL$bmXI`(%ip@->MJCQ^@T==cSoAu(Q%l_t-YLut zp}D5{u&t-Gmn5zr<|NxH@!2)^5j2YX1}Ys()34+&Q9Ql^*EQgO_N|ZI4O1Kg1B2Pj zM5=Y!9hxheObbcuh8Bv7VN<-BqhA$?cw?!YIHCv$7c*_SC0yEXMx$v|Lr@8fIzsML zKR-FPqjmcUiV0gm+AcRY*{^pE&evGrI^V`pRLSH_@qYOlqDm&W7p|p>iP=7uwh0Z# zq&H@s%DmJ2^9o>aOej?=-5> zsC@xy`J(}|8mTyXNrP_9KG&zJ{*PE@w2Ot-EuT*y>GiXg?kA}|?q3UX@VUNQ#f8o6 zFtZ?WF2mT2|0MhNXJRbDa?N|j5~A$w8w?{S^P&`}^9t39c%|t_MU|a>9+;r}BriX>k+0e;8YzT% zy{vGm{`Kt*;?Bo4iDrSHQ2)8+642%tgsS`ie3n8qkeM(vMoqBHM5H&MWb_nmg=@>0 zQl~LaJfipj=TbqVZCb?aNoZ-i;;Bd%phPiv&b} zEOZdOvWfU5Hnu_Ln%A>(&GuIgCSZwKc9C=|oU&z{vrgn!4tCQr3Rh>Rb)}la7bZ9c zfG15Z^&j|VkyqrE6)8U=Tyft3%Pj$WtClejc%@OqI7b(8@dz99hEmcdFef>sM5fZK zL;Z%s0sK%dpS~(uyTh^$GxO;C`|tZujMJ;rfhPs5(FH_6;|(N?jQ7?BTYK+-wArh! 
zq^#tgZuf?-K;JOhPR*};KVx%dFDfQJosOLE1$=JbRw}x=G1q3sKYp0b+{xqsX5SW^ z^9U3B<-hVbwZ3DEO+uI(5_hO5p&(gqz@`JxCh$5lE-p)9WmItYk2)`HZ7uf2?7XKE ziWC-+$qMXz*wbiVz0qrGZEhwR4wd3^qY|R#?HdVqIrW1jFN~+1&XtF@o5r#0pSL)y zoa~vKm^d*9a(+Y2X`~eS=Tl3go!!H!dE^2D7a~hFB&x#8AVwS>`?suipvJ^ zj|TA&P@e(!`d6a)$8X@pdHccKi8zB3I?NE~h@z954hysNb+jtnOqEGr9tzf(6?j^Y zmH5ho)a1=exh{J~Q=8m?6cK!O%4oc~cBD2C{_rl9)|h6tQteW&y`=)#gkgvJ`7W8* zD_)=+N1>&v5V02e7M=U(4TeMOmO}y8>(4QFnU`BXNto z)K?w)QtNHO^=nZM7q3y52kga=M#~~od3BJ+#w6OH=O*Y*+s;xJW88*S$P1U7nd*A< z<2YC9sOR>Y(Q3Gjpx3Jt`hy8WAR=w5{V=ILxY*?0fgYM2g|Ytf=zWvZLz22ss2)qv z@B})e)Mc^rSw3&9IS!*ny=%kpjS9cVr!T1aczGHPJo@d?XZ|;71Qh45T5!N2C8Z9b zUGB`FPk5JoxDyXmt-4bw+MnAchqsy zp?X$#GO^a><)?~a!G1+0CGcZXkm0@+zQ!kMF-RPayp9EaisKWQNXKl>35dyp(15V! zPfvoX;Uzg{Mdc>(X`)(>zP{HR8)C7{E_s?t>0sq|s4y6+s>};b{h)z~j@PB`u2eK; zT3Gl~sw-QAiH3%T2(&z???1#P#KUpzYm*Uw{5UJ*IX?c{)E%d% ztY0X1N#9d02ZFzL(DMPE^v=2WK!iQ^Ajji&v#?09{pEe(8YWx~$IXxMBu*NY5Zeg4 zyvej=4D~wKPvh<`rh3rC<|8gU^7hup%iBqUKRv@HGCaKO@fC8SFpQ>{<6A3F?|TpX zI0;xq)XOgtK!TNt4+N~dbfVF`lqE&F6=g-qVzLe?;V32kBsqRe_vrYMoHXc-;8l39 z2}#4_$WB=rSFcw}q&orFb9Y`pS0uIK@z;O}0|3=2G?_o#Locv@$FbhHWE6M!z|C$O zW4srLk#s53@)D&~sF=*Eo!c+rT#AekQw6FsmrJ$2vZ)FQgzC*pmO#*3xqt8GFUU77 z8xU3((MA1x99xfV2P!w?O+#hh%atPrSJr;onwwoe@D~L5HI7tF0JPsMD34{lv(Y=| zS?79A4%IuQvgB`~$Dln1rRO)o{ zJvH~nT9sa&@ij^RY!op2kes3t)>bWrYxF`Mf6NpR(5{E%h@mterK&_ z7<7HGyq&XchTj;5PTPcYd(xh&yV7(>P%9b#+1AU*`j}jMpG>%&BGA04lOOpexA+_w zKP)o#omu>u6Sl9){J?)1KQXAHQ$4~CYh|u4+S|NK#`!x*8o0I;BRz{ zN6p7b*W0`42XL;Nv>5T!D*_M8G9_yZ20SZ)H+n>!e|XPBFk9V{HEht1ci-ltbCE=X`#CN?IJ1*sOybKTGET;0quS3LrXesu68q28 z>G>mKq<2LkCVNuc&Ks(Xf`jW>v1Gm3BNek0LH9C?tVq(^9ZLq$21CswNq#rR5001@ z1k-9Jp%CgurlwQ%s>a#ceMb-U0Z$Ie%-N$o0CGu@(qx!W>R4JdpX>c`fe<&U)plY3 zXHx|p#6QxgW$4#HMRPOWAgAEvINfL3Gb!1WD95wCrbV5XyZOd+)ozayleKsnwY}6A zYFfs^dgaw+mU94!Z*|F9O8ua8FBa~QuN^ghs3QMuX3}2tz8DJY@$1JhAb}F(=iGsM zcEWzpx^;PzqX-CylWFZgd5)K#99JR7A*_&sH6{z#MIDJgU1;FSmaqS#V4E938dn+Cw&T>ahn}TBSSoYw0 zS8a8Kt~Ss7BT-L89Fh8|QqN7-&SCndE*#_2>dOHzsBgdkcO#Zgv%LglvYC)J2|3XJ 
zRRe1Qg^#)TaALJ+jUmfo0yrf|U}bc!D{Oh|scSQegEMtEkxmONded(q^MX`OV643T zMROz+p9a;2Ue(W76(&(RbDQ@9?YeSNEs0f!-O!v7hYQ>E;OJA%r)tpKj~sUWiJR23B3>t zkZv|<3@Se}MK+|_q4*uF;n3{u^@Th+$s=Qq&L7on{ ze;i+RR~~{d!da<_LKUnQtL3K=x(#!E34cCM5bpS$s zWT@+1P|HGR6APdzZ?i?cy|Q@a1@dSE2{ok$bj^Zj69)VG-qn2I_fi*r;PPvd`(5C6^UW2`M{B_oPK3#tHOuvyu;JilF@;(^mS7;`f>J7CbkWTwfkanr-#k6 ze4JskjOg!M1rTvH%M=9~`8hs4jq!ZhmcPl+Z}EhJdpY5W3i!3VEpzn0Ysv(z^2~fx z9up-GC#}eXe-K+yPrUU+xnc#tl^-=VF=b3@4q6s{KV<2o&Ok|faUOw|?IdM(#~Z4) zs20A*u%%h);1pP7ceMjLULq#EJX5_jXVRN)w#I{NAC#13WKX+E`^xv~oCvtDw>{!7 zZV6ioL<>N(z>;b8A{wH?=8J8JR|+!|%;_wK}>h-lBL><%(AuU6gmq zfzNRFF0;+sgNP$aW?;y1VsMhdG6PX7e70y2J(>q|a&mG=+^A+>cKCS_e!K0@-ZhDd zKm;j06mQzMBjY?>=jg7)J7Ku(djdc`(j?|~WE>o{oa!n4Dw3!$OO&q}=9&_Z-{sUh zB_AAI*CwGqGOll}7#MddibPd#VSNG2aH%&ncOtt0qqJan #3+~JZy0WPNpFn{C1 z0nta4jI*8Bzi$c;d9Q%ZXf-i;tembX`k5=0P{=Jel<`gh4;vn(nWU2W9gPKA;B)v- z5gu1uZNB{NYcF(kzbuJ4S|#@zP9BBd_gag$@_MKaFZr>UGq}L7dtK`zc|DH269XKI zsGo(ua~5bL_5!ktUO8VH-Sn63`g(=qjQ)Z=zDG|NC1rB0{g>Uq;3kjupuKo&$@%$l zsw?>i+TC3@QPDWEa#-?j(NXM%54STvUT4?fY;AoJy75Jz`@R{A|D8@-EuBCa56CDm zAV^yr?xk;JQr+>_Luy^I9^9{qddqCzb5c;rG)v(>w|RRWRZ-pzkwF`Gi(ooaJ0)q36Ztx}Vn?7$|)f^Qm z&mUJddi?h0!6Bo)Ps)mlm!n=&?3={Ni$J1o^m(sOqhK*3XksoLcsTbpNaU3uigxjKNCO1C7_XQQO` zTpV7%sr}HgINs3RN_#JMWviDYl%)x-&-cd~Jzv8XLIx^T{V*hw3$g^ZoJ$+acO7zWiLT-=W>7hD(OMs zeTxiksEsqrr-fHYa#q;|7J0oDExUy=2%8|5PUf?(C-ZyrGrY)nJmD6a<67+O6)LKl zTJLZw*W>1{@Ol|0R8+=#J2RJm&AL17 z3Q`G%3B*-?>P2*y5m&=%WrMi;v)SzBUx{!s)T<`kVCWZf7gDJE= zXbP@ae#!6Anlg}i>5Nh%svXVx@h4la>oD$@E5D>A`t1?kb3CmwZK&p7p>ovCg!4y0 z4eMUaLDJml1XUHB%v&DKti=B4uk=nsnmFeTEZ4p)Gl@NB><~%XCJHwb&_&*VW@eysX{Tbn2%f2V5-?xo0pBH z$|=w&1q7$CW*p`k5Ob(+(ap)3muogyv%FbzyvPS>efdhwPGBAH~05y>PWh9XZ``)>@8Pjm|b%U&rIK3q-H&y z`@@VN@qujr)kY?IZzi&KwQcLM=J{y;2Q2gPD%vOWje?zTMe!pZZ+z8EPV8Ly}bmTp#ZW-kyzH1SIXwoWXjkroVMzQ1&`{)ikOks>nOlQx7I z^HfSsA+|65>hRs*W}Xc#rq?*rtQXb&Q?RSYD6J}|+dmwvEPm=o zwh{PUG2N{O5U0sV%J?+8S>DY)Y$wNctC_D*D4(#JLVTqphk%*+4+HU$^kNJHgI)w2 z0ZWki=gCtn9gb&p1qByQxw2PwMJMn-Oh1Tsc1v}3HQxeDVf%4CedH`6rs!Sp?kc9u 
z618e;PsLJaKWLxc!>+BKa@``Pg-SHk%*=!pyy1pCL63zXiKmP@biFq`n zo}0R5G0$LiPUtES@0u~GFabLqh=H%?u4ih1>Cj(O1jt^#O&Ibrcr`b08(qAR@v!Y4 zT!~`@fT^+WPn!)j2Qs7XhE$R95zS7ENvYXa>pAP(Hk19`n(L{emAkwgrCQwCn%^I6 z{bYuhV-#c+6xK+iiOKy*v(Fc5jV!|^KAd+Ifva~|gq=|m+$vQ|%1rv;%!@sfC*=Dj zZryqGLKVhy`iz_P#3oOj#*m!)_l|d{+p`?uqVM$Ffq=0NQz)6OkSDqJ=X5F|Yws~c z{0a3!e)Sq9z4rV}ZzDV^9fh|EY+vpw!GCBWy$>D4X58b>dd2A=5YHh1B>ZKP^L_~_&QCfVRR@NTD z6N3(4rWpM*G7+MXCM)b}+H)VlDGzaI*8iP(w8@LF*BN(Mm%eU5sl2rKbFWNZB~64y zGL9mL6=4sw`oNU6ZhNQRTA{Z9K`m{DA9UIATB@Ksy2I^W^@W>Rh}J|_+qb$X&-8!| z8rhpW#RNXxQ}T9{G11Md<1^6m&{DR%YYRX5hu_w@RTvtw@4F!weC2n}%$XhcvZ5g? zjBO`v3MjmSvZ<)__GE}C@Ie=r@)^xuTv_dAQq&6Lb#G-XESQ&@8ZI|_Q@(wJ;d2Hh z^VJN7;Oi)+$PW*k$%KtG?KZh-(yEB|(8aDyZ+zn6!@W9JE9QE(30R+e5Y=Z}{89zs z={xT2)mN~whzw$fyGE(@?**#8Xc$g31 z@xEQ-TUcE|7wPgE?0lG)skMOk51|zX2h5MXdMU&!$01Ov#)|I(9HRI0%N*R`ws(ad z4)@5*J_TyKK5ksz(>8ToBF$7d?ap&MJQ!YZox(JIYrc79ce}$ipp&@k4Dc#-oCrFg zH+xU}k?Ayyy(oKCQTyE0TJJL{mH({<@EZk7+lGX=m>tHqIJySLkv0b7P8FnF;f*{3 zRHaHd#&P<+!=M{NfDqW(VPfX>WODZvyYHhkMZ?^Jl!qMnsd}5*zV|GrSXrnO9f)jo z>*G?-r4;35ZYnBGBodhvrWhWh>U^uYJ&DoPJEV%4IctjND-LTs2)`ccI)jHIjC{r$ltPHxYQCy(5o>To7w8HNcyWhGsIZ9fU<#+r@y3ci(-J11E|jt zqiIMBHe}E9)*Sx9)C_#OwK=~W^T6p)Zp1gq7!MqUle8R{KM`G|o4F7fx*TZ5Xa7rA z+p_+R=&pAw9g_`){vmrOR$oG5V%6Tv4)amW*2F4OCe@V-C}uz9!=#c|;g(IoDM;)j z@l}2I`%+-KRgqH)7+J`1y9>$`ZE{~NV>4DBP3yPmV1f6!-R!p&v=?1a=cOm3=bP8v zlcuz1)yo=ZW53d`@2FHv{RYa_OGL;|IegplSOWKwg*5QP2a`iy7n@(ysk!f44hXia zmj%82!Gc43i?t^cDFPQ|*ak8I(&E8V8U&`=E4PPU%&q)ZMC>^D^5^A+227cPAIRnvE$iiGuvDd)5T=v{ z8cR(n5trd?O{QQv>7^(OxBbVbE`XA#R!B8Hd=_a^;Mrh@YmJ$bgx5VGHtWVMwk5x_ zb+oHmW8WP#Nf<+;*Eu+@Gb0{&1Z}R~UeR51yzs{DWS}{AnDmqrK0@$|8R#Hy2KU|1 zsvF9C`pL@^dR%l*B^tEKs~P|#SDjDXKd1S!5H>rT0pG%|cTmUI}K=SQ}d@Rm`)qv+W$B5#yh{x@S6E_k40Tch-nuc@Nl7+yunqjWzO@ zmQ20Co+S5tZUfx~T9Cga`@TJC0`soWs_1*$=1n{D^&U`bo8Vw_7U>K$A+37AIo1;ZU z*PD=sq)-4^|CvAh4WAesk}=7ApGg`zf=HSzUo|%v4_|JH4n*ZuEEyR8;fu1oHC0fT z-!2W4Z1W_YpWd)|=$HyueD4}2Vau4Vxu5n0DGT?%M6u$`rE6Jli7q7->G!&jk~MXN 
zOC(uvifqY_t`BGHvCeSX?gAiz{-Gguer>E$+q*Jy{j1f6HmO`YavOHM=D2%w(w0|3 z#qqK^_&r{M>E9bcf+4$>z!~4b)BUy+1~iZ$ z#A2YGV^sk-=N?pXB27NU-BV8_4F&r%5PHp9ma*93)$CjY{wO>cAW!k}g0Z~2)6set z;Nq?%tAK)pTB4Hw-4k-u5HgUKR(?4@h9NJIaieK*Vej@%zxK*dL*J%h{>vA*DjS7v zVwOc^+ZY?gQ}=~T(L)Y&{l|i*th8Oo>4a=BB|%w8s?@Ue{8@R*2}7Xnt8^-hE% zf>Ny;_S?un8&MOF(f1;Mvfq=?`U>sG}?_H z{I->j0!mOC+#&msJILGy99Emk4`hyZo;;7J{+o8k7MF>BLbYf&EGeyYXDIsu2@ z)*-ycHYIpnlouF_^c1GQODbXR`XxCTm@v$#5T+y}+tVYB8MGP8!d*jjda<^3C^TH+ zO?R?&cx$c8s*vlWWo%$#Jzwf~j8w?z*luaqz!kn7XMb#K| zJE-sEelh^TT_T%g%giB;su>|aC^e(KIYJyskf20J$dF2GWcBk`e7uOY?*-Z zL|wX{!hlYR{JYP;6=j~|kDWTC6nciu%dX zSwpUx?)0)!DS}}8ZPKhUwgyqJ!tvccUIi4ZzT^XXl&EzB zr_N*d6^uiu2mrtqHA?ZvC&Y7^w>2kDlmrp{;WMy9R%>X0(jZi?ZuneHh1tUQxa6o2 zei82|OR0LLxS$|!BnAhB3A){JkxGdV!|*31X@rv$6?4J@l^xXg%~+_jb7SS`G57%z z-=<<-Fd`=|WOmm;!QS65d&R`W%vs_e(l993s7P%k5XQcq+GaQTcE|^cDR4tyTAd7* z;^SgiSqIpV@!dbG%r@eGq(sM~lU`jOCgR$&s+<An=&ziG(H1`8#w_VRB-k~Ha1j~HD z&7MpE>c^EID5-9v`GG`28A@vS3A_a>02H?}GutU86}2e0_lRc+z!&E8O8&7WZsm0t~Z{pW&sXg%Y_Hf8tu;7G+g zQ-hshvfKYCEI%P~*bI7}ff+ZrDu|m?q)`aw`wMdUmgBjZGpCpz0Dh&|MvqXAt9%0S z&f3gndhJt#kBYd*mNl8dY>G+9};~t0AGJdL7KC+_zj4 zr>|lUZSf}>VDRZGn)!ls?s!O<7nZcJ+&k;D?cx43TYWL6lXv1*Ma1*ksaZ7>$7#nM z7UklHHFscQMu1sKiEeFPIu2z)5r(qoY_=BWHuCs_H0AuH_Al_xVPRvxR>5Tc%36M& zn_f{U5(+?B0=N`D8l09a?b}!8W{5js%sL*7}WH5VNM;#&i zQ&lSmh4NGZ3>3nSJ8+?TJE(nVBGgR5s z@@`;j2GZU~G3BcTSed}*o~!ld;-WrF^53h$m9)y)?tvI4LBTrmvhvLy56y2fs4WO} z_Df8qZRN|Y`qSOGNq1$0{A@;IgFzwQ->@+v`OuKU^W$=-e|)Vd=#b=et07h|Gj(Ok zp$)IpI%d`}Em=AdJA|FwsOu2!_f2ZXb=t9Ffu;%p?p)1E;!70q>N+l>I~Er!;Gi}xUA-FZ&Ihh zFOy_3U1te4hsNDEmFkh-H(%TWlmpef1(Rmn!Fd@krj7M|{bII;a9FRp7>epk0Ou}> zt2uo)`67^!9BVHmSZ7tl*h{za@ndg!{S#~-P=VGyWbwSr#xOZ;jKC&9SCA~xt>`VJ zyvunycNsOx!fw=%>*PBp%Ibo_*4{hsPM}>=-K%J07u1lA@6OxmJ*#HYb*Tr)=7Wra z4jY|PglQ6-`e8bimfOiT4_!L*xwmGDl3=>uG4$24((N^6+D?5b0 zL$1QYs4#ONi~IfX#7<%$L2-{xxP3#M|9+i7VSYl$b*Hhm=KE0Zkk%XEnBwHK!NI{e ze^49{VQ{a_5@thHJ9D8VSlJO0IwGBE5EEK2m!?`=b!|4RhQGg#X6out@731Q87o6h 
zky!9>4<32{h+Pm398&t)!=ykq`KtqClT?QurE_R833`nm$L~iom-hDbtJ<5`3FmwY z*v}!ohLXBMlRAIB2ppH1)?{`adc!`(?v+qx+S_$}Zn<3nUl8(H zfi7M#YBi!+$ArAIh2H+r2-YLl*zYVRSc79=SZz&VZi&wOpzm~2+!Bvj{Wp7)s%bj` z8w{z*(H^(bDfleTm(`)#WjR^BT7*F&tvxX16&37kY#~Gt?bDbLd{$PxBP*#$xW11c zwl|3Cz*T0MlA_|I7(6^YUa7-pHLf-#yi=106ZOSDJ~~Pj+<4-Z0&hNjq;^)d^zdez z6O4;b;GLay<>k4lZHQ&kuFNR06^>^ca&}I5A)}xqgpWg2JlozIsK|~c_3fFLOh!9W zcxcwb*nbAKv2danoo{8Q(ENcLnN|AC#`Zm5R6D?|knR_@pwuP`)Va|&t~IkR&s9Rr;S@w^r} zj#L>!v*p;9gYO!*JI5Px!aky)EIsY{LAv?J$H!|HPNm5t3SJ~Sr&Cazn<<~K0`sSm z_c@P1S6hCVa8I?MBLky*KAwhc^@;G}EOz{$o9L)B3_a}3>6={ImN3!@G713&O^*@r z2I~eCB%X?^A(X%tS~c4+jh`WRM>PPVx2^Pm=>Q&XN+j2cN4)Pn)@%GIY>?QRttS^< zPz*NnB>?K3Nihj#5U%Ywi;Rp#evs0+nytS2I3-K4%7p)>*vj58pFe-ruCk;=wmUq6 zsmyHrCIJbljUyc(cd!Kd1fKI(;x-Rn&@HV-VuX4x~)BQK-4 zi8M1a_fA_2{-m(*V58#$d4Vu+h>4`3wY7}nK6b4vy98ybQNo^?Nl_bW!to-Qzwi$0 z#5z({BS?Y{#*z%S3Su5RD%%{6cdSb-n{0^ir5l&=|9R+n2H!Utce~HQ`RFJ8bqepzJ`r30{ z)ez{UR8_@1QNLLv3Fyz|wl=k~IsVlW)je*$rmoGaDzqp)@*PB#yYvwZmefXcW0zEz znVZ)RurdCy@ZR%>5Ea!HAQ&!<+tM-hQ`+qDR)!)5AvQ#8l4d-2kFjsAYl!ui*6gNp z!vA=giHL9U`iwXUZnx)osi~>oCZ9UWP5QgS9)59mUTpSTjDsGIxxnv~VRo)Rr<`pE zOKBE-oK;wMrwHp&qz0g2)>JHPx*tnountQt;Yu;r@7jB(G{*VJI%Ja%gIq1kZD}PWq$?GvM zy(}D2k%`1}H>Po9LN}4Y2}{?UZ{cV8R+M2J=*EyE)N%5Rj*m)yxNdU-pYXf62X|rL z9ieanbGuB9jAU)0rOaHoQEh>q4%u5xX8k&w(KVB}N1EiU_<_I^)`89Q=1j@wdcww! 
z>yJl6F-unGVNrUFjB_$FMpvX|8Mdc|LZ@wHPp>o|76(C`6cmh%eh-PFm_k*#@%}Z; z0?l5B64y;Vno8?NuXbtsd!&bZhk&+}%Jk`);2f#dwaWT5ppWG-dJibJo7CsW9E6YG z>LL$Ss4NZ>iU!3Nrp5GKltv~KdM(s@D%y1Kyy)dnc#NRrxessTeqAC0kc9p{;37-& zhYTBfS&4r4NZjq$JnCElG=JnTrsbnWB#)doy4vq^P!IHWssJE)_O8__Zj4H}1L~;h z)45Wem3m#@&#|Aqr|YQ{GD2*!N!v^CFc z!}h+pnCk0`eB8XsMbq55d|NBKjPZOA(jG@uarPVDD~lpqwjCgN22xfUa^);>gj=a# z6kn?nes?^4!u`!x#|*rlKbw@J>oGW@b({8aHjd-&59f5~*{g!HdvxZkJ|yQKu#U+h z-1t*9&{gs88Qew0n^{^&Fh(DsKK#OkR+o;vb-wl2{u(F?+(Q9=WMGV{$c=Qi;(cxs zRmxH=D8p31^{Lvy4uoAi>H0Z4g^$* z$7~bg#pQ3YGU*1G>pP^$iewFO^(0v)x!RxX$6k?YyNfF-pF|At$Zij5_66Sxamjw`^G*8WO z|Gv}{%p*-+4N!ba0jNiO)y(f6OlnWVk}yFKVl9eL;0PU zf2AKWHgG#(gCc-m#OiGsXtje2OA!-R@d4`672QmMCHdpI;1djjSu2GF#4$9QT*!;P z$-Bq9un znyY9|m@XdcjiXuJFv>`5L>ejDNa>3~0T0~hZS%z?edh_g#9Wf!qO7D=2Ke%Hoc(@! zb<`ZL*Qw)Qx-Q3hP47iXzj=F;AQCOQQBm@!FO)5Pu~U70DNFzvgfMaY7u6WM0V9h% z4c`YKfZn-+ze-6UOxp4}nhN*FfWCU8i2wvgGN0tOw)ojuNSG!uSVNJpRv&0+EL`*R zWiJ)`Su@*g8qX?`_bQd|c7f)}u>+;_D_q56e6+hS6~Q^bfB(F((=u9DTc(4oew|#P z<}OPTiD(}}W?Pn^WBlq`-&h&GCfK^K3vYz`P6ICfB>F!nN;aK?}EvE_CcpTNSd9(gPpEBRK@VOmC znkI@LqOs8U08F)@JDvI@{V1iqz~8F`2)_*tkH0wl6PXeTe+vm%he(6~Qj)3E$}uG| zW6z0hs!(&bM}futd4IqtwGK>xK6dk~k_<04-E9mee&&7~0uL#R^s%?$G8-7ofbHQC zU5tYJ%;RAgx$Czb92kJ{iQAbb&$&X>di}d@`2zSlz(DndJ;v2vIt#YQHZ=uaI|1#Q zWnUDt>K{$?&&v8&OO*E|jO=&XDFBYn@9U+0XU>==`=U#??12cFr)1Xie zh!1?XfKm3~qYe}DU})$NErYYZMXh*J!LJ4Lx!@vL^-z3hV}C#t%B3)0l@81-Ovpn{ z`uq}s-eFAO@mu3yxEPknQx3*}V)v75R>)u9_J%Pxe3&2+kXI07C|xEcARxil^7ao& z7ZLXK*}S_9M7pwBm>KSudtPc@^q!tRS$P>5X~nR+W1*9x2F2SilAE|Y7o8k#aJcFB zK&8Yk%?Fr&_s54)1LYoj_b16!?K!3ot+}K-PomHWg9rxkK3gAE*#G}k`?G*M@~xeN zyTPmi9fRx@0UfC1C=l!DpDtdV{z@7D`H7K%QBr}J>_f!w<@WhZty~$WdCbfg*nM$# zdiZx9HOMxW zn&ePP;M-js?eG@MGM|@?7i-4pNcYfwJ^$kP_;lHemX?NXEx}KZ3(;(Vw62S*x+6eTKh+Xy1?N;LS~b(=m)ae%>1f zNsuku0(}O*Y6Xi)hPq^YQ~)E2#zNv_Tkm{f=YY}WwlbEL^JYYNMfm4C7|&j}?`<0- z0z@A_hG0Xj0}`2_8Yy4L1wudgNdYg1yPtUiqBsbD9u?4972A~5H?>-|9ZRDotvz+X zzZ8&wHOux>5&Dlc`)#CuCK*<0HH_TPr9KVx^&D5bV^e*t6I|cUW@AXhv(WV)+uX`0z?-1&c46b^<&YxX}ZC75Eywp&` 
z!pl2iFnYB3d@a26)-tQ=a*Hb+zvFLm<2kg z@HxnkN|rzgPa zW9+p!uBU%mjYpG$gzO^wmYjUnzp(5VF*l>F^l0cCm>2k$$3-j^?%$i_n=YV6{&jt^ zk9)NCow^MQVI4zyctE$*F^9#}%L!>RgeXUgSPihX1emf zGyN_KyS0%U!q_ADbz+WeGhjf`v!XuBMtKDWHtgtqv4e$SoE6Sfd_p7@uq(Y!;1BNT zcYt<@pP!_Zh*$I)OaGWbfDqo;TGIr+0ejBeI4oTrf(k`^NI7BQA>OZCrKJI9Jdz7G z*y<}$Q$K3~h)KEOzBWn{vD^4q?tzm5)dgTBYmfP`$MYzYm;_05nz7Ksf&ghd|%Sg z2`>+YBv(%@_ETT|oY?3Pni^26#Q6Sf?_WR|nIsV!PDuG~_zME_4!Nk~@hzOmc$tPwWWn21O6v$f!pn0Ier3*RT z-%k$+b`8iNATHoLy1KeLIvVAAT+i5|nvVEQHpTjsHY`blI0Zdm(jMHow%>H>maEsUZJ3=Db#L3@>al5=lj@bPvr!wvtouYtaGDQdS{Gc~a; z17Ic6@SP;w`Jd5>=QyJE4NQ_A(Qc}J7%|{I5Z=wdTg`?qfrd9%Gg$8mHSP|>PN;2m z=b^GgB9PvR_HLj!3OL_G5HYJ{6z&F(2wB$zU6L8dL(0W6C39M790e2=T0imMcEw4i zniXEYWIAV7g2(@dt#yZFvUz`sEl!Jq2y9MRdlk;V8`yu@r~mhlfGhgZG4sQBCPchK zifamF^dawU7FNCP#ZMr#s3*7lcgyzApy{^~{duDQ)5lz?m0w3yI@80wMg4t!bc~D& z0vCOH!FdamD*uTE{1uS>pDy^%7xIB*@*YP$hKZRe zDw@lK#LoU)#Q*Pa#6b=)1QV1clK*De|NFN?$r+Zkn|C?ZxSD@vQ z76Mqv{@XwQA71_YGmV>a)eirEwd`ptKu9Ih=BCK{>VNe--jUBC=xZV-po!MCYJPQp&msH0<5l0hP>;rvRFi+oG3 zQMs#kwrz4z>tDbAkGHeL`eo;k;iX|?LrK7;W?c1Lqx{Rt36leQq8xza_m|@Q;~#i= zF9uaxoDK!qulqRf8H*U=$;!brcK5Bbq%C@s{~ujf9TxSrwT~#Mpp=A&fP!>MH$w<0 z2uOE#cgLWJba$6@=g=YD-92=7!_4>N96jfF@BRManFof6y;r>JUGHAoHs3C*U1lv9 ztF-hPaP`4#^`^%s{mHKl^$i;ox{>eez;O5VOi#Sc*y@SOt|o8IxjG~=ndT^X(}o<*tmtAQ9U@cD}V z)lQ^+^oBeG7`PH+l*RP$$ zV2gcxg+fzfZG@kR4u~hfOfVP>M!;$Ne!!I|JOmh=t!LWO;rMHBfb*t``kyqDM4vqP zX9w@VxCavAyl}hgaohRF`zoGMxQBf9v?o;kmNb`Km=2c25>nSI~q9C-?k zZT7vt4#@xcp9^HXfH^nxdd$ZhXTh#knvZ9B74qcn^*s5Sg?5K-79MP{A%JCU4~~RY zxbSx!5+j4YTVDToOhuG zE3FYX!b^Lb%C%Z7HO<*lkTBmu^5x-Sgc_CF=dQb(p191$UA*xsF@SKUhQCa?Tz#~y z??Y+8_?Pm+BhZTfgn&6YDcSl&VZ65(9phZj=1Yznsdsco;nxGlLac1m3K_PVWy%Fy zEI+F8E_n)o=5JGj+jV;o0rko5)zPXvjv^C@BY>~|@?VdVM<>Aeg6yiDGPdJeok2yK zw|`9~yz@q-KYE@#xttPQ__aa5oF|#Db2G2J5!StXj74yDopAopo)C@4LFa7P0;_&J zrN_WvU6ljgC=NAxMV3i+@3Y*>%b|D$i)4@!C6@iUHLX--Rfrt%;hxz}5eh4%RVpg| zRF!pfaXX*)Tu9^Q@D`$chxAPmJ$@hEt#){6y+a|bEz&k0kC2*BMd3y|AM=z;;o z^zKgn4!}_X!)UaJXP5lcO$JwGWQNVvZY{Z^BP;aAJdUh`HkJ~z0QHa857M5i8kd?! 
zO1%7BBNlKsc+k`9XVCBqk=XZu@T^&ld?Q(7Q`@uS6x9@`32RUyDrgfpYP_vC_e zmkd`R%qT0zRB^;9ZD>#&18--VWhiIJkuFFozQ8@!BK$5rW_X&G_Zq@lFq<((HuU^1!QIDo zJX={E-Urvdur326R6^d?%fpGJ3q@H3+xU>&^o@#d>Pha)y88N$)3cTHzGOsZghVEO z+Iar8Rx5bzqvF}oCH|D%QSdjAGlSVmFd&*B0=f7k*3)Y635Mq9=d<|3fGDg9j1UtL z4_OC+2|4UGethLyM{+ws#D4cHWx;N*7_?6iOwht{||9?qZ(-PrxU(GNG&EWW3q@86N zqZM%2x4B+_Q0wpf_gD58Cgi%{5NBmc?pDBqh729FeRIhb<3X9~s^N|~EgfK^>ielGJ0I7!Mzw~>*p zvG-T*jea4Bh?)E~?*}4IPu{zFV-S3g zGu3AeuQ4z{DlLy3O-+$qM^02a7jpt9MvRWK)}4`qVf8G~`->-VRABW5Oh7(?-Q@<; ztR#8C{uDS(-$3eL<6Nbv7&Ed0Q=m^H)~LtUp!=T8?Fc(eVLE0_E>2#&D*ZK09KTU*tIx`awLM#@dXjL*h+8dCO(~xNT z4)w}~6Qn5rm-P>R`Ga?A2!S!~7nJ&Q2k)M_if9-WX>IZ(yQ`S`Jis-O-_;cAITMWN z7FREuYK-@qn#<~X+U-!c)jCp)gLA5<;oXtcG=o#0BzVvXD=c`+;4SgZ>4b)B(JZ%w zTA<-vSOLJ&7vzmc8UoWt*cMz5-A1WXjMnk9Y0ub5Ew4$*^hgF*c~%g_>Or0rh@*5#t4E!7Yov9Jd*9!T&^&|nSiqZ z(5-;zEZ>90y1`aR6LZTAZ*66mWEyDgfYIh!o+~fzD>$qd-cB=H;48Lv$jst!%vQNZ zN3kSvI3M1+oT&zA;F=!MZmNk@92d3r9d0TNkTeoCegx#e!2NNYr^ae!>ZEH^zCnJe zdaPWmJHgJ!K{+An*muXo)}Jx~SEa^ykH)B#Xodstq&z=ZVG+N!i{OuQjL9aFj#BGt zv9AFTW~fV{3b*B6A=|8luN#1t1;8w9|5=BsEHHl889(VpPyBCTHDJ-s#3{|eS*N;R zH<12oE1L`vfy@3v!;iI)6a#A_`+8Rs-vLOeW>OrcgWjqz6#59IQB66Tip`_rlT1(#T!h4zZw^>QgH6AwF{@_9N6+jRiRZJ#|pzo zPULbZji6x>_Q#e=Co?BbjS~4I#V?p{6A4cbtallTTh;`{xK0e4Ro_nmqXzeUv>E-KTtXIxNnVbKF}2sKQV(3%<7y#Z8xTT=m9M2bC;BbW5<*OSXaHQ^TVZS28oW@*nEwnC^ZKWX{P)oH z=ej=k1IWbZ-ArEY|DCb1aDYh*uNyM$TXLcKJGL8B+!IvH`1BBcPBAU$V)a9Z#E`iV zbsp!Nrs|U90Y5j~B8B>}q1dlg-#-YtPeRnRgHi(WY_HLGDY!Ei92kbIYIc#7|0-_N zF*VIiv6Mkws?<(0wy+iK>9yq7r&L?8?Me=oZ8?4D$MNa*EntKvK*=_@Qr0d_Bz}0@ z<*Iu6s6bGy-Vmix3AWx+D;~!+JKisB|4a^{yJ(UXQtXgahPr{t$*DQBf32&Lcao0l zS)@_hX&$sEHEGs1&H=E&orUAw)5fy_^o=*UY#OFOx9PE!ud6C;=$JojF%*P9?9_b| z3TXXGy`IHmszl+8X9+}K) zoaf-3xrdIjZLk9?X|{TAbZ76cLClm?nl0b_kfpI)`14b~c*vrB!Q%SjflKLUTm4?d zd81wsA=NtyX>;H480;O+(Nj8QTDrR(-oeFW&~3pRPaXc&Mwo+YaIlxKZ7(DJAyjX* zY0)Fbc}#CSp*6|up~mLP(3$v_{9Bvp>F*Ny!fTUS;ona2qOC|;$nGh=-IyovesDDY zw3gP;B%CZZR%uz}BjHhV4E_WG(iGXJCw_k*JbI<6cJJ`vV1Jcc=>S-x3v8ek%aZ5= 
zips8Ee!p``r$MQ_uyRhhHSbJMKY+2|{#=e#T4Tzv>)3*KY%evco5YH#DQ${6LLg7K zP2~)2k0aC8?)*Njx@&Ejv}2y>CgTz`3dzEPKI2E-LX@t)E~S@z7p#!0X>oIv<7TGe zN(I-PMfH3lscL_SwOhuDM}Q%5au%{Y*pTQQxuguSxH@uCi~OOYN0Rn~_Zmjx*{qWD z7ThlVP`Q+B0&rG_x{ z?O*qk8*0=|uk<_b5w5{dUz3o?)Im-Y*O5%lS7JE|3vvl2-Buzchz89PV*@&z%7TI|qFI-niY(qFrrpMj*6Z-ghkt&i&zM-~Dq9ugvs=vBPBXY& zJsXwzbCe^*L-`eXQ|Ejg@ilK|{OETMUtr0wYrpkkin_fJjc6_7G=AE7cUK zjRNJZj~MnmYy~l6(6-XJ&rkB)RWT<#ETPhA7d4}JW9rx-{A68$%f7luJ~7v8tIh$Q zbR`MCOgr`3YVGZYC)O(GY?>Xo8#MWwi{^Bhu5PVzHC|*(lhU%dEpP3LzE=8Q%Ln|f zsRm)q9HV$I`{dg{tM~I!8!bAdvPll9PdVNt^7A7abG6Tv;(g||Mtx_vMB7X{(3{y= zRlP4sNra-lJi0Q48Gd3d=CKt{*lSk^;M z9+Y5>rR$au@#luI7xK;4d=WvMe$GKzR-rokfD39=xsVefx>TebZyaIcr zM`?OtPD%9si^Np+*Tr`AYcZeW4{^__%+U6^=GRx3h#FySo8jvbBELT=+1>x{=`}!@M9Wq=qmDv%+O^qFSHfIUR9Qo{>u4P{ZdU9&<>DdWY zN(OR#91Lk0hvcS1!N+Umw6ck6DyS5vCgVBMnX2EV4J9hu=A2igJJ6DuvkETlFrW3S z*EkC0Sm2eigNYl`BMS2gi-$q~wnkIx;>*EQN%z~+^!-NgCK#@(t2@a`C@k{J+f`+n zMp&}?@-cq*e^I_y!1(-@7D5e8i__|L-SsLL=PIT81!CKnBxkpUAi>i6p3SxV;fg?% zLw|@=s8)SBJe$ixCH&4S)|~Nn-3b&NtA6mbj7??Kr+`s+=Q?$%na$rcQB<;az8;uV zK(Eg7!pA$Sqi5fC^JrM-SwoU}`4@5Iwp1UY#<(ca9v-auy(Ya5O(zFU>$^D+|<0};xe^c8G`R-Ld2yj-h>XjGut zn)d#N!MI>Pe|W&lCK?JH_LLCWnJmOpY3ogLXx|;^?F@_DX3bq)-={8Pd_69~n3(Ki z_DPVEfk~xW*~zWX-++D-#b4U`J#O(v0R;omq?;^$@Y@Off-u%^sw1UI)Cz3hOm}0@ z_T?I>-<3y^D9|@5lvU!Ysj)}vPMsKe{bTCCJ&K`hVz|$gn*ASP!4m(TRkr5#)m;3< z9(UFH#^zTWN&sH1uBxjiL9w&2bE67IiuHRc#f>-T?TpK14$UPl?JsSz=&9_it*wIz z0~C;{W$6IbR)2qY5Gaa*NUi$2IdNl>yWm2la)GG1Ib#2#1Wqe5AUvOu*!RfgsPpR! zK~14va$>A|0W-K@=EI{l>ZvIX^F8zC*PmCZggQp^#_Pn>+iACyD?8{ejdR@xLM8|0 zD+MB0J!Uc?X452TVPh|t12K3RO*k4^L{x+4UCmd%HrN0~o`5@ec~qexOgei?pzm2&GVVCur{$ zDZYc*RI=sGN#u7EZGMU?43*0^@RrW@grU1bI+D2dJKsv**iGu_!c7e%qUS3_a+Q|! 
z#S5k8S@x}D1urO2K)W)e1+!1bdMD>TL1n5e<>w9v)EX?Q**FR{JVJGzOcbd&VUUh* zS-0N2<4e)_Wa%Mos6S5MB$-1Q^n!|1sMl|q`iclSttYxNUj?jVAe7%AEk3Gtu)4jj zb6BEs%hrBsOHHEk?F03{bJSa^x7I72MGaR}Y(=v6_DPW7eM#_a9mRm58}i?g;2I0a z|8DMd?|!#p_Arl0RTl7CX>2&o8@0h*_nvMX{K2+p_RnumYo}PWbGa9`bH;b$7Q)Iv^VXUyf&7cDaDnLb#mOkzyzgTiTWIZIof$|-i#eiEJivQggCOU<33`-i1( zXc%?-LC=oLw7nP5NZ@78T}X$@+DiOyj;xOh)g^E=ps1gx8tHI~;J93k$s1Q5lYvKciWKUzdiJrKVW;S>geWbhWY#T~9Pb|?n>e}~8YL#j3>%*_26)`Ci0dgym%Z=aF_t~TH_P+d)>~Rzep1JROZW(c`)RjBi z6SULF>;0;u3NaN>t`ZCl(<^Yrh{$xesPjxakElvXu+w2wj%Wp%241QgQ$0HYgHv<% z4X1%b68y|R2VcQL4z<~r2|GHx(64At1+Di+@AD+e{AybVp?gie!D|Z+MMjHL$#i!a zX!UvM`27ZF6-wsBKoyn6o9{#E2#_s)4i_U^sW%dyKKK2g@4m7(Q^4Fdp>kG8EHpoU zh$Xao$zEw9i)HQeQyRHhjQ9*|MTeQUt47SQ4v&o5)sE3KKUjqekNx)x;T--g zz1?-ay_p$!{FOUji+wRvD(eMbQeD6op2OQ!7rh@oqYdO42J0x^*8Z3*j(Ghq-00zj zu(`SU;RD=(crGM0sjsqFChqUQlhS?C#1--l?)UhQOpi+cN_~V2nr`uo+2K)4%y8&^ za-_ObxjcJuUf50_pO{9BaT*-z{^o1}ZJUdCHSUU6^S0^(oGY+cbB(ct1UQQ|eTCen zrrqD;5ZE3PmSC@Wj&a%9G4Tl=B+x;G`Qil1l}O}NQ?F-1AC(@h3+;E=+c-V6S)~?= zUP%$c6Prys6xcT)+;&Z_VoamBjnJLJabVdR+jp`$fBylsTM_uUe7=u;)J~JnuO(ES zbo+gvs@loX-xkgd?5xgUx!nD`lp5-fU#Lq^l2fm|E6Pay3~HNp@lvnS$M>sF)+x`|uPTm) z{0nmKANnS9q1*D3C@O`)Q&5Mq1!F@ioYCA)->c90gk|g+v<)ZsmE@%nI-?MU|Ly;N z9~Qf!Xt|h;v>!b!xY$V7egp#914HSU=kAi|9hgx=p^3TdOE5~Gn-z0%s=|BnZ6qu3 z2BBJ!yote6do{!6-_eAxsSB|cwQK+cqhlf|`*f_@Xl=5;=h=SC$lAD2j#%ABJPG|Ucx2DM;N@=W(rh57C2DxtZV{(cE6pE4pz`~AVw z)u4O)Fni2lvzBZ*$aLO5kBfI#wEaTmGpajCd`44zaK5VJElxGc$^X${>PLm|<8@&; zW7ELGM3O;PYo+FRDT5%HALN31m8=v%TCT@al2>l?d1l^GWVuSWLohWFX_;YN2sFX- zvbHNyFdk}8X(h^UFZc0r3FgSTYK}37TWVs2&oKl#T7vmehznKmYc7uyF$u&#*D7|V zxn4vs`|>fDhEBT}3!}5QDk;pB1I!7H;qMyo z*Fhq+==b@}Nj$yI%<0^Th-ozKVGh#yl=@YfxdfpT>B!HwIn&~Ge0cYzF{B8+^k^#?|q&{ z3vbq@qB$$8YD3d+lIPa69{_6~MDf<5efV)YxQxcy%6Bk2`Q|Nv45L`+FXc_5I|2!e zR{}mqM;j>W`mnoL0wL-)>_cm8Y$!|qz{Tur-5DB$s-(D0PIkJa7f9Ul-CuG)nN@hk zB?D8-)THQ_j?YQf?RW{DN)*Hv?G!M+@8glXzENKp9ZuxBR2CH*z-b>7)N_sv8y zbu_(dUj5U>g_0xDKq7`}*NhFaf-P!6Tp4+%OY^evr+q|*Jv9imhjDQeXNLwvj>j#8 
zNJRK`{LMExMkanLxMY3Txd3*Y7T}Wf=VDNIYs{rJ1Lp2ktnG^f&=%dFqOGH^Pp!6? zBiwQ6BK3h8!C}V~fy4Mnl#eZpsN>#e_vuOzw008HK?d14J zc!*U;Vlw1;*@xpAJfTaf{r;82y{cG6_fTeRUj`IT- zF!=UxU?Gd9dt~Q*PZ5*NFhY9A6Qm$#5qsjjkI%1~*<-OEgTTp43uj|do#pK7DQ?VG zJH1ArzEDY_G|^M!Y2;}Uvf<5~iMHr`bSchqc;|dz{6r)v@oO@=`Dwa_q^-ljL9}+N z4*b^4nt>~~>+Lj#%Z}IOPYaT8u5kfKOJ93>aA4?a@cGj!i5s*vG#skxO@m8h;?+IG zXhJNNA&mG;a`zw<(%e(h|NN2Q;hc$g7?}WOQN>!o=f^~ZuRT1Mx-Pr?;axedW~w-hsNiL34mlWAjZtP)e1{wF@qVxR|}7t>|Ci^UDg^akUQ-)54H2U*Ksxt&@cE!}qL34*{HRyYFs;^b$0>=8-!HkS%z5^`4Bt+wIByx8l? ztz(SW~jabu~a!r&cL`aN>c?=8TgNCvGDfX=y=~$uEvA+T@N&yDZ(2s)vCt@ zZHXm>wk!}!QjVd9#B>6&DYg6(PfmJKf^j-5PyUmM{I%SMh*f=jdHgSa^DDlTK?F3= zSmD6n#EHnGh@e(#0S61N;*DTz(aW>GYbvgOuIUyMK@ss6=t^X&v0h{hhHA2XgR^p!RwSF+l9r&^qewl;QNJAY_ z(bd&OaJU4aoRE$zX`M={-80F=3i}_4?N25+pRUOaNuE4I@OPZ&^&odN`(Q66x^Fi6 zcoskL#Nv#{dGYIgC%2o+_>6Rt-pA~3XtEd9>m%ZTNPKKSD~DW=G^f4U<6xmWFj>Ew z5+E`w{#yNty6nwQ!nx@yUwwOphEq|VFA%bLTzCrl*0O>LeI{_H4SZ|8ubZUyeQFVV z9~XU`>O4nkOdG|T8Hw&yOUF`Xqn0rDF49yw+Z zDe69#`h)zFSy+{P>|4a#0Ax%B|yxXgKdqPgyp6;fCcHZLvOG(g{*H_;c zS4q_>W{J;IG*Y@t(7glWGKh3XixQ7xZhm@pukZS>`h?*8OJ>U+X3U zxi8keqd5Nq!cXg0-yZ3ko9p}NZCpVmmn|)y<9fBik2hXA17-BnB&3f)RYIW8}CX%cNCyC|c&rZ%gw}%2ZVI(D@(cC-(qA1M^Oj%L0A! 
zjV&MRXevMPOcq;DC=k)_Y$Q5!e`l_?JR832c?eMJFOOq8nplL(pY|;rJ%^ZQv4-gv zpjc%J%!K^O*ZzS}2wWNG>{SzJjBPwFj3e|RsD#4W#=+%mHOMP2KRaIoi-6}jmzbjlp|sd&x(3}MUjavUO~X^u-SLuzo?PvAu}upo?PXs$JGne1ymr?F=oxdgDG>a4 z!B@v%i^OQG@I!Po;YjpM-Ma+V&0}bBnJ;d{`_?&>WcOQhOT(1AXFD;$V4K?uC!yt4 zwNem;q{07xU9i0W=y>8M7|HuP>WoH5FFJE@pr0LxT&0STG8~p3j>Np~mCu#u%~U`A z(LSE7lo(R{Dno8|8lmSUaHl3!SlZZZbL~`2B5jdsPKTIACf#mBEqIgjiSDY}Fw zsnQcm^2y1c-MD{Phs6*C7M(mC*1ab5)5Lr{yjZW@R`l}3_4#lKSr0Ejk@ zhXAiJSz6fCCgW9u-)%c^w0YPkL&P9`v7eU9lox-N8Jx5_O{3xDPq1O*PfJeE2O3$1VUq z+fs2M_R;qF`UUvLWtk%?<2ctroFn%Ru>Oj=9oWVFMB7{2J-b6KeFd_SFQVxDawFt2 zZCMWm?CmSFvQFVi0vsTvY@yKMN+T^C+3p8{`T)*`cd2OjBBsJN{Vg2a_W~uZtY@MC z883JUUJI106W*`)7|;RUJA%3ZtUD9NGlP1z&%(H>oXZ=q?!t5WLuKG@0m2d^d90g) z#c*%Z(MWPlzw^?;M<*KAkWB@X@B#2V;JIUV>XA^Y1Ri zWR}M%*v%sO_zl&+{`D?X=b>C&T`Sj1UhfRHwA9<$e$^`xdhP<*ZnKHbFC!|^{Ad-` z;Ht1gID-L^v?tqRFdWYP+#B|*r!<04SVn=2`t+#G7V$Y@PQw8&8{7G?S02>vBGZZ1 zF?aU;V0lF~3VsR0ZBtUZ^*hgC2cV2)R5@ua^)e9wSY4X>h{D#b?Y`h7m#a(fNb7h5 z#7>X-^wyzAZf^j-2%p7BQ&%O)}Ee#G0>*Rc~L6eJc$r#>R)&M)7X`Tw|= zg~1*Z!|pW2$dTCW`Ov19&-kv~^?o*4snxY&0jhkp6H?6B(x|4*FBo^1!NxHG+Pvb; z=3fo$Ux?I_8g6Xaq|`dld!5p~?|J{^X?S40vYbZ2CsKgjCFDV;kH5IQ9yOB{ zyQ@`9L5=`hkS|jv5edZZe_~H2qN;(EY=li1L=h`P@ivD4M@j3kLD-524hFV-hQ839 zo7HGJm`fDqkEDFaZ`Mpa*IQd%6~A>q8{OCU|GEPLEiG*xE)`D@Io_CO%&twfel5G5 zzc?E8Wx9H=y`=z$cDOgEe!dc6rNOAY3rsy13xTg(s=#ko#=JQz%CsDXy?Jf6TpQUM z-m}{)!^`F8j5T1R!Ftu|-~KYWhnQ8Nv9Xp*Ul8wIAN5QH=S7}>|^X$H#g%1GAju8FI zDHtOL(D{0xP9g)a3ef^ziqb%S0WEFTGNpCn#iyg&+chv?04odlbz5EuGB zKRMelQII*&^4YIX)d3xxLT)e`-+=)YtZns)bW z*j_5SX>z{%i6wuzW0TZ_mr(3Bt8L_$*c79MRcYmR5on?8KjA>i1_HGHT4?`!mCL7o zNmiEYoA2x;)ftOlx~UB40V;;@U|iwh`O6VWZW`j1vVBmlT2b7c)8O=~vbwj@1Nraj zEumA*t&696C1n3=4{j0a#_bruc%IzzpZVi~wW;TeE0IcCrIQZ7k`^-H$nti}#(}Pi z)%h2}n%?H0+q8R0#XRwB);rCh< zN_t5)3l-Jtc82@47rd_?{rb8;zr732+nToSUJoXwKvmU6vmLVreS5=lWpnfSsMFet z;NAu7Xj|CPAqlyj3jB{o&ZoxRG>?D11RwIs^BD)ad<*p)yqgxXEsFk%f zclc^i8CWepnBR2U320N!X}Ag8*+>T}>50p74+@|q(UQ%VKJ!n$iY!3S0PD73me|6L 
zM$47Lakq5PJLyl{*MNr7CXv*?(65KsWA35Sj0?uE-U^$oN0S-6PGrC6VZ;Z7yEjo6 zeoqcfWdrHbr+D#cEuMkW@${t`HYwr>D0w;yuG@ zZhYh3m4npx%Pgb3Z*d8vAFQ-Sv-VxxfZ)_{2#|L8jC;2peBis)@ezE{auuKg(e~#> z@@&3d9&oAHwamJ+uuAhuo{hvIfRH-XVm4Mc*tp>R%zx%Z(*P22z37X}h(EpOpWaCn z=$L-nQg9kf0mR*38~EEJQFS0F{1^ZI2LdYDEBk}>)m0^X`s3B_c~?`y!d`ccdt%lS zqj)wmH6JxAQI+E&{=uO4zeoaA{>81;_>LDSTpG7&oqcn(b+(28)K>$Un>D8+6yV1c z7P-{`zLTpIFHk;q7M`qLX}KGH+sNijIi8-l@8-}Xc4l8J^D+nB^;FA0VJo6}w&@Ex2FPvXPLGzUKd^Wq7`qL`aCriTgwNkI!i+vQ%nUi9rOk8{TkR zMGxU;Ki8;O<+JG67Vq^DAIgv8W3=)VZ1KBDNsT-?L9bZA&QK%p+adxjOl1UZu;TD# z51`JW-FVBelrY>qu)xO5tw>3uJq}GrsOG4iVci=Fn$Le*dszPT1Ib?@%VPuus1s?I zg}*g-Xf}U*_NB%U7h87Gc2D(hOo*f&|5OA4>v1jEn z+d4Ra>n+X=oO#nCX1J+QLr$EGE`~f0uiod5=U-f+5s5?w4g&Q+*N=thV7wz6^FlwG zprX1ncL*IlkN(WTNxDXD*zo!4lJ|M?AShS0{$?HI{!*>59EL&qh&`bRc6*)mF_yRm zR&JtjhAj3J^HEg_-=&Yw1d($mB6)wr-z8H4pI3K9TXf~uyE<@@;tm)`Nc#PXp}$tj zloB0GGgPXBB!Zfa@0&Z8t2@>92ol%mc)IghO=Lxe-eTZepSIq(EwpJb9UxWo68+?Q zgdzhYlrVpxP7is$op?!h`x^bk+-U)(5fRi@YL>i-!u$v<2A8uFFMxaKl=$-HzVSpJ z*RW z=}VGza*x?e4ty|qetVzh`=EniLD(fzbP&)?dF9>QQ45c*-*0HFu0Sf42SEdxuj!xg z*l#RE*HXio&7~XwH^c!qQr3t5zdR7>qIp5k#(#vGKkkv^OhT56^~z@IS3QP{&6kx$ zKVX!puUKz#l@`?0$k#H9*H45@O*aRurT!5l}&q7E1b>g>u+`9-yB0F z=G$|&ol3u8tG^B7mp4899B5Lhg5JJ#OWvPYoy}}IUXv;<-MSNH$5p|Dl?$E=;-v1# zd>ZhRM|i~k*+b8t*h+!VSTv6DR)AJHqEfi5#!LKlzhrv>ZE=dR@z57DKq#0FN_@5F z`ss&1oyOJ7W{dXL#HVhvBy3#vCqS0{(-UgyuUfs6>wzwLDggH}4?noE9Ek8(%TD=N zxZ}%bX^GE#%Oc|8;&Rd4AM!pYm;gDSWpn`D$(|8^|Bh4E2=-z=MMXtZQ`6lzmOLR5 zfBJ3p)%7O2uR;I42J9(+G}hWSp#_Hg{{0m&-O66{Dd)rJBE*-M&f($Z;ez!i^Z<($s zyNs8=iBAI@HVC@yR^#Bb$>3JqCnk}|&a`q{!ZF1m`ev0{1Z8Dd5Bc0I$5#5|6rc?* zbsHgcTQ90R&u8yRrO$eBPXu=Z)(AF#L)tbFyS!oZ+Wo&15n#tveNAnR_hNW|LAF2R zqvk#WA>kIaY?;ldn-C< zlakfN60};Z^uxQ&K`Q;xkE+&!%vI#PfPo~JY>1{9TD=P)3Dkx_<9=KsyiJuaheiP1?|mQvGeTK?0h#4^jq8C2?pdKgZcm$ zHaBb6A*VOdjtSuKiy_(zEq(OaG}+*ZJl*`Ac9>K%)a`J$KY?Me?9;!H%)c$oo-_qk@l_2`L9TJ};at{DXiBkDrSIMV zT~VU_Gmpye8=F;t7bf~Z&bBlZ@(WVL^UV4hkoadF8~xlS3cPz(`yq@Wmnira7Iyr4 
z?kakpLa}Eu0WKz&o=q9J%!m?4$f+8!Q1amT+--%DnuzHW4r`w>8}p0cA%P6DkstJa zWP$N`U}mXJZi+VtdA`YCqfzC$cVEZbE|pew^AUB0&k^pNbuV98yHLa)_3Z;48(HGV zPf(v<+Q8q}*3m$%hsPKM$*J!7n|9EhgU*G?-8e}9a{>OftJ8AK4?KS@%&*@Q1I1Ea zgbx#vfkYxbpdP23Fla+%%{Z9dM46hJN=k~lVNZyTW~yPYte3phG?#2t?%jDNO3&+Ft(F!?uFEcB}C2JY1dV2xB zqRdogu7a9!pZk0>q=bms6ell$v}|I;!j)yQBN9IMO6=$vP@U7C?d4-_l|e1PI4&1S z)17tU@#tGFJGbwdJ=D{4a5yJ(7zb1|k4u&$>on!YDG3P7A@Os1A=B@)e@qSVUdgKv z4{O6Sc!-!{IE+4*-j{B@abDQ7rv{&mI`Nb`MzeN2iLl_l>uvINJ$m4cNOj-W1_5Y_ zoVH+CI6BU*LA7tci0+6dGg!GRh%Kmqmu55O=a8PlGbIaZ#v-zHSud5he9#()k`?`_ z@U{A@@Fz@x63C(ya?@HLT+Zs;yLi-=*TL+K*9zOeg0Kgs*iw{gvyhI882AtSB;fqlWzLAqz^RlqjGOZ>2yY z@{|kdiSWnL5QQ_k04*d!j?#}+z$>P80o0Qo+NCczG}##9P|Fe`7HK@l++Ip#9Fg7m z?|t_eL(G&+=GAEr2mogMQ*x-Qe`{!g>72gaK>t<4P~5v0gctD#qu}4@<1cGaq(hfu zRozL#sNE=6GgYjpaPf9{!BT|(e1%>DRh2#d1Q4D9&D_Cp45M_5$;8W^FMTEsJ`+AK zT%YE28A07WQGm{*)(pSIRznlb+HA#Y%^3%7-wusocAsP~8A~BP_}ev54PLZ;$1XHTy3V`zZ`T z7?%}1pi$<~k%}a)b9>))1UT28{K45^Q4BjTo8Ez5hE;%Q>uXU~5zHlA5#;FNlA$Wb z1zv+@YasdMeLi6cjPmhBuCGCLExq;}IB|WGW*bv^cgkiM(WFiONDQ~gut!Ka!Dr)6 z*<0wAQRg#Nby{2A(bp%vh5phv^ZNm;@Fr}ypmg%h(U#4w&(;*P1zt*id>Xq?Y z#GObLciy=@Ax?iBV_hte7<}pLoHp5udO9Y*7vIHjp9kgNF?-UpX{p5<7l3v|zRev2 zgaX7!zhFTV00n;-nH7KuSZ}9Zqm-w4^&5o58%UzRQ=^)7&N8qZg zmfG#}@K~)xT!fh|2a}FBk_hWt#CHP1n71}lKoK+%P@v3I=~tEciiu?Vq|2rjLiqlJ z1k-e#K~dJnx=-VV&uozGYN0P;MyMKthAWk}1Kfzm<)*sQOXWT;^=Fo&$7r)3Js+#L~lEUxgqd$X8Hk2;=+K2Nw4fB$l?yGO{hmII!& zu;b;IBYl`biQ@|BB3N+?V~)3?Bt^Vq@;GJ|F1?G*8Rr~>U_1Scq znvwAq#BnGx+J9NbCJFr)J<{o4|k$BO`Zewk&8~5NBB3Wa?tvb!z)UmJPk(NY zDd~Ygk--mQ!H>c2dBFD+Zkk!tQ&KUbwz6vOOul}I^w1E+A|4g_@k^N5LY51!^-LU& zEP4lTMKt%jXnM5<>+;y!)`FRgf~jmL0?8tQx}?pWoa1OZ)uA&@_+~wqxy^Lzd?s+< zVA!kd!x>KGrXxU|8`dEFDms7rws%%@jSCR;w_xFS|IwJQ@c?@)WPi9%tw3#o@MFUa zDXn2rmlY6%3?68Pn+4XLC}=bae_VYGcGueLv6>T|v(q^4LJ^*WPpVxN=1XHj?6WG9 zxs=Bp&flJQxMr9EI=Bk8(7$9iz`J5S3VcM!W26~togx0!Ax}uEfK#qb7!H*B&j9zw zuO<}L&``b4qIHIQx|dx5{!VXYh$$0N4Z}T7MlFteS3tGV4Mu2?# zA5WMp@PI%JPNyRy59;brZUr;g-=>_{%s{r*(5U7`uSa(10kPJ~{-p_U(5t(L8r>h5 
z#1dQtoc9~iJfJ&R1jhpHba8cd=W%)z1-f}lSg%mV7w;_Z;dp8Ocx(us&%}A5N{Qap z!138WXPVoakp_2Os0V)*A;F!f0i{p!f!70ycWGu*$ij3sgS&nvnlK6>fS#ui6?brO zxE*zMb&D|8*LQP!+U{I-BpwQVrXWn|QtQw1v6Y@zB!f+ggSmGU@AgzXXB zKxttBXVyNANBPX8>=2Gu3N1QiBBy0{emnm@P&%Vk?SmOnoeYg1l|qqb`Ik+BLy_-E zoItCtA=25sG_u-1Eg>Pn4FBAa%XE5rw(_!on>&EUqMSgPl;DNd7Bb2$g36N@7{G4@A)mVo20 zU*6f!Cn!Qs6m2aWcD-5*8n`SK{)i1kMm~ne-``{<@;cG28XMc&2LWa4&b}|5g7VHK zlR;I_2Zc5<17JMMv9(Q`lSgsqAmb`=eck9WCOznK2{`gGjjFLdaytdcB%%^Na5vf< z!@mA`C~kdq?egK3Z=rF!JaFtOGuD8-ydImC);)pfW-n%YlTEh;&AX8h1DmXj43l{A z4%Et{51|skKOyrd^K`ND|M#JB$)LrRBO0>1x1Mt*p?IQ_b z4O`yBdO!4{9w4;6vD_8)b7XQ8wKkTJC=3qfe<|xjhr4OEGn&Ecai`8*jyBSkiiloc z-^W+yEdMz5-s(m==kntW8(l)}ciJ1v3w$&?<2+gp;9O84?R8sYc+vO&Ywsgww=&e`H zQ@Y1vYa1#WyUxgt=n99;Zrc)YbtV%2v)Sad*Mf*_?-9gTmW9@>r{n7gqsLwKtG^yE zGqZ50%Z`#Str~W2G?_46fmm&bdv&eDbZ-qi;q#v+V`84P%3i;aWzavdEbs$ZCQgJhd9_ep5T0 z4;Rd|$xzh)XrX0=-1L9hDDW$DHK=xpxy0B0d{+ijq-f6+h?#hu_-mX|pVi zSFuMDL8?IRK6P3iYSRyy>i2&l1C__~^YglMt(^9iyw?tc=vMXrqHImWt{qCq4 zzp}n=XryjO;Z=nB=K@35n9tr%w0t$KnHV#dO*5IJ1!!JF{QBSdo3hQ(c9v$b|2o}<>Af*xdxO@!5j_g+N#q!FGd=6ifQqXYuzs;Z~v z%x*KHpEVvfQVL()Ssvy>mjdY5#Hn>)Zy&qG4v3ItktJl|VH&%kd*fv~vGt(c zOfiRidf6LK3psT{eW?P6j1JPPDg-@_dC2>$4VIJwIsxl;Im#-yMUDLk#GTgg+PxWO z6tb%OSnjSAbE%8DA}bwTU-?q|`tA2kh<5V(P)po`meEy~6j~L-W$TaIn_~ruPkFOQ7E*TmR z53gvi)|^x!n$QYAzEnE@N6&xcEP@nQH3RkK%LuTyq}mneqSaR3nvy#@RCZss7ALJ)X6|o#Z;GjJ_cbcl-U+d0h8&OlnpJ5m0%9y2x=sGFH+J0xgha)lXLkUR_?oy?u)zouL^Fm_e}Ts zKkbTPUy2FVl)W8>cn4If*n~3gw*g)ne_4tNxpYZhY&z!7A8GPFE4Kw%OQ_6_yHzns zZd6{ob4O2}R5)EOD$ghJ|Dj+?kz2Z2(HJ$FY6pl!k*?5LEtl(D@0*OZagraOm05Ju zadM&mxJjIX7|Yzk;>`Kv?1XN|$u5;Tj_acCK77ch4oo@{JB#;ssx<1-A7seoQY~5R z9-{$I;lcxZ&b(GwRmHx6#LmuMq|p2C6l9TFy1E2>YVFoCt*t&ZN_bUf>JMZXO;)iu zhk)8Z)FabRYAF=XzT` z;vV0EaUt&T8MI#v>9#02%!P;qr(r;?Z^?tvnI%Dfh|C4Llwr1l=I?h^6j~IgJ-Yu% z=v}EL7*AAOah+^kVRO99y;JRDOm4c*T~#7lZGTV%Koj`~oDUN7B=MMLH!fP8395}+ zCPb}#SR_IldKMAkgDeNxAK7DdbxSpltG2YTkaAl1k!d^x{qs`!vF;Gg^Pe}@s9!<7 zx>uqs3QD{$Yk9u?q51%1s6^2n&il?-<5uafUkLH(|G17l!5p@*xB$GqTB>O57-N3@ 
z6F7AGp7)^3;&10d2Lf}#NwOeNnf4G;4N>y5tU2W8(=+XzqK79l&yjDw;vtoItzf8V zw&DWcNSXvbJiL-|HaT7xNJ-UCWPgk8!Mw#U$_lvjbzw>uMl2=d&Z`|*cVJB_!gGgv zjnsAvmWP|{6n%f~gvEoyjk0p5>tl0RMAq{}lLbQ?thv#K!4^1oxCM2;I?hgN8B7}q z0-aS1WWqZS*n{3Q=+`~A=uU2ERx308u-c!#$W&AxHT&(2F7DXg`gZUuf*Uu6`5u>| zC5(?sp_#dZ>51zdLKz#^Xtlk2 z|9&uAX^3i;^zCyK9#iyx{$cwj@a~N?!p}HfU6Jb;@!AEL>fIBAFYV#>Lo3@q`X?BI zNWfp~W}rul^zfGg!oy3A=2ZXkM2{j+hHKL6+P;Is=Tg?j_M|p`EQ1E9`<4E~7DWvZ zEM`rZ2l9_b?4wrhCF8Gpy-}$*e<$+Q!{D52uJ+?xp04#bdOt&8d;=l%E*>4fdvE{p z=^>(+1Rg2Y$M$SjUeA~k$fPV>w{hXkSc2fLW*F~d%+f&+-L=sZ#p}sYVcQU%9o*>A zh7@%C14Z-z^d*wUgQn0DlPOhgEUNDha$}?FJKbF%;O-0b?BIklQYfiZia;R+M_Lg`?|@^-o@tO!c&Vy8^c>ZBKiz5j~Y-~zYIT7 zzG!~x<4EzJ&T#Gz4;r$IM^WKTl&h;>XQx9-pQtGF^bQzVu3&IJ^|ZGOfze51%4SDI zyhLO4#uIo0d9g5TQnKn)(c+>K#%Ar6Q{TbHd3u@GD|U9}hvoKSuHNXHEPv|30QR{T zN;a_?SRd-m^qkLq)YdJb{yPH1g3iN+JTBZR=EtO)RzkR=Fb}eDw|jWtTUD?390H9j ze~Vim7!&519h8L3)1OFJ{s?gQ3+)k|0o3B-<`-DxPCp=(7158fl(18=if{c5kPjCL*8w3$z4lOK_rt)%q)^hV zkN+`;QmzBeQ|G*1si~<2Kl1c+QJ!56YxpmeuJz>3y6jgA7v>H z=n*;2w*EH>v(s^%{b!I3YqcJ@a+x^*~6rQGF@#?-zwfZ*gZWCiEWl7>J3 z=IaZ5UU|%_7w_6#>mrTIv2kkN88zbygH57TlJz_4ne=dqNc8GJX1hxs6z6oRhl7I?Y1PvZEXK(K@-an4MO&O#>z7TwSe7W)PU~MaVMKV8AyH2> zhtix77#Jv(DjLd>tFq%E9DEb)feHFNS)Z)1KmR5tcX3p@{5r8)s3-tbowaqU-|E?Q z2*#Yxz0?)Eu%4qLqEIo7b4T%)p~E_qf`fzc+lvhzD=dHWq*AG9$wQp$HdeH-7)qcZ z{u4j)yVfq{1nLEvrtk*F$E%rUD~^u*Aq>GT^DlqZJN~K{88S;q1!%mReb48q?bugB zP$}Dbr@)aq?{o{z;4$d^yqCt6DW$XI^T8*>}J_6bCb|5rgDb1L^rZ?ePw@~fp1i}p(urNj0+(xPF+iZ!}8{BWY_o#g;&h9l-83Q zNkZ~=0qrBZt)KP#w@t(DCpYejaHrl1nRrHjxbr!UP5r}sL(`)}YfktLwfN*zBI>xk z;Jwa3v|Fsg{TrH=&V(Y4?#-dp_qWg+m7H330z}N(;z6T6klIf0M3-RLBE7y8qp=e6 zpvZ-ywV(%1@XYJVBFpEEK^a} z(n)(EwtO^1RNcSO>fWo@8nd6W-3pv~IL*f`wu;&H=Yz#c3`ZnIcjL99 zOS39Zay}}ELT0Nh3#dJeQjo|@o%+t>akw$op4CU7YQK^U+#5@5t#&Th0V?f|AfsH? 
zoNw36^?p|>HTFek2vQ&xvOPZjGMK%l8T^qIvh2vN@!=E#!h%66o*R~-WHLbtsRC6; zlMZolaptTRDZZj2HF$-8%%>~pR0};Ma|0iTiG-wOkph!0?}U&0E?>qR%{^7)SS%HF zdSUoiuFJ~m`nR5B!G(mn(an4UsyDhqf;jYd#dC-PA_LRUe>Hfb@}A%r+qyl(uBpj6 zd_$-9uZw!{{554jXmBuS5h(F1r1JXl<3~{~si2t9rDGfFsAn+5pUI){3p3LnmC{J! z5)d?qqVR&C7U#(yUw-Y3^8D)N{JE%9g(%KOE9ZJ*^n^44nR~MwWd(Ib@ryCc=7LWw zH^<9?@YAkodIEn{qWk-;sQz5kMmtJQ=%0ek={Bg~rfxx2XN4zK0D1v*9-xpaT;|}w z2e1xTF?o8rC<`AEM5)}7w|BQ+UXEL+)$9s7)r0a=Iba;~VwnXmoGwmz-@R+my^6Z1 z|6Uz^%`X2LkdJ45Q6H)|o&zW)`qDsie`D$GAM<$afwJ&ln@)2Z@gde+Zu}aFW}4K; z`Qs9X5WaEQ>(35Hj-HSiJlw!v;)j3vnwu@~v6`5eSY;07t3LuylBaX_Q=yh31o7zL z!0MD&;!=}Gf1vI06LeOMCRh#Qns_nR0Eoj2N47nvJ^~RgpD+5txc3hi7ne6~5B0bf z^tc=|{(6ZaB>6(i!4wAA`Ed;mjg5J-`v^hJ(?8v`H2=ucjd#y{B%LPy3hZ14as?;V zT3}OncIT5gn5`ngB}+)9*fX;2k>)9oGZ1}9J=xfsEQA_q)%LD4ioqX~r2tsj^O59K zRk50=QP2o0%A)XR)wQ2IwKMTK=J3oqAJK{DIG%Q+U2E{X5{|Z3JKA)Zo1-Qq*3s6P zLPMNu2-ebvXhn*Pnx&<5dzi7YF$gU7_xI@wL)~NzCaX@sn-LKa_vUJCFGEU8U!`z5 zJ1!=~6W_W$b!gXlgYfrGyX(GddIh;SPRDOzfKZGPjqmR68e-f_{10a)^&}}NsXs^M ztRs@n+W-68u-ZB%we?=-8~Eo<%EPl-YFJrGwRMcD^yuQ)GRDV_wtE zdA7d4(x)}J_}KV=nt)W7pX%TI{Nm_z{9-0S2YQk56g>sryPJJKu7`?>05EY~9_L(f za4frBfo)%HNrOV)&!0bEy?O;=nlQxi8atE9XO#CppKCpasIMu4D_mxYFB+=^+Xxz+ zHh>lqO0!vd`FQ3wHmGN-^ABv|n0JJu89@R3b#eTy21Rj8uMJ)@6vH-=DVN)flG8+ZVfDFX}8 z(ttMQBAb4H+UdbsyXTbE6y4u&N||qZI!*+!`S5p_O-BUHLVNfSn8F6w;B1)v5fnsu z|DA>oGQX&(@p#z|0B7?La6Zk(`Exl){fV1F+5S^|oqaALrgl9>!(r32!wsMb1r5kA zh{fWNynle>s#;q)55{>+4%SHj4sr#W)7EUb#Mfb-eye;bP9{^fHOrg8dSjGO*6m|~ zMT)3zYYR!EnoMO2OgH__!uLG3g@KWC+7+;o9-RdHjySDf->!6Pb z%I!h}qd$NUVzBBwq2D{RegHYP`3Dyl7pN5S&k+z_2`LjuqzI##)x53xdF02og(M}) z(q#~;L;osGT=C6mi!^#9<6HJj0Kt6-ol6jBjrHA`QigKWeJ z`k-^Ck~Ri}bo{U;3F$w6{J7X1zoq3&{F+6(?uhITW_W~@%U!HLaDILcfEASo*p7o5 zk1Pwc58Rn(avhG;RTQXb8%@91u7~&Mx-2Vcu?Vkjxh(#Z=fqOQ?h>H znF{~C+e)soZ*?IiSI3TRcpI94fEq0^QZn*fOKKbu+KI-aQL8P`qRUZktuk-4TI>?O zRikOOdrYO~N0VzxDL&o?(z&V}&g37Y)0P(I$BQt^sh<7Xzq!zq;U2tA>5-~489`5; z`GTNXQ-Z#X!}!~Jn)`BRC7Lz2cKAAO8s<&_f^+2kJv28sCK2B*?e+59KEf6pRlGc3I;Brzr^+TsQvdV?YAmO81lM*AQ 
zewgb`{E0(_Ef;t&Po^P#A!SA^@%)dYD}5t+dQ{|f4Yvcva$R7vvA45*aCR16v)45p z#h{`yTN;}lt(3lO+HDbDNLR3HtQ2{R({ZQip=|p15e!^_I`6N8L%XZ(4_T;7tJqOU z4o`kVh`E2K*+qnVIN-@MJgVS(yjd^15v41{f9ob(EFMv)6&c%3?TnU#7~O2L%5L<< zknmKQ8P9o}EjU5P{B-hRE@)j~GL+@}Yh6fhg z!v5F!Nn~p)@0J%f#A6sk6J1^$Ewfm>vn6-FC^quPw#G<>yD!w;qEtwCVNYa(KD^DN zew#@W6vZtG`fkm;6@<~<8_2A!o{qV3SML%6pirq*4To}KZQXK9Ch|;9i^@wQZ#rlG z4T3A`dZDjA9sVvIRk(Ig0~s=$oSdU?!$ok|th0dInav?4Ro~vZTtD;QEb9}UlEI%N zfP}y9FCv&Owm>7HUNrl5-F=}jir--YC{PNdq_atHtnFFRZ>45B1PU>X0*%flCuX$5 zYOR5Am{#cR6N1!*%R)8|x@ftFLN9Jb90ZQw$SSxDG|Z32o6iskA+*`Qk-rP#BS7|b z#c@==b$>Ei{&bx~aQ$6>SO5fTdZ5U^wum%u!ewNX&AJD+(%~8eoczVfQatcu1q5^N zwWq@Qvee3X|9%)g)a&Y5^CoW0s&fEBF&$MK5o+bs)!jXC4qr1jP359`{Ns2NK~6V7 z%`J5g#nC*c1Lq*6A$FINNWt#fm?#Vl_0?yS#P&eS-qw{9KI&2Pv1s5DuxP6g>bC7!6pCMy#d43`xAu+1p8T-vySRv^z?gOZ54c^&_V_wH-`=iiBmE9fy3aOL? z6GeJb`6Rms2cbyi<>m4Fr>nUm1^tE=ni+g{1K=je(x zJTNxY)CzAQVM`280x9x5WfNRFDMShhEw~pt0@7um>9zisXqW?8A_|3 z17pZCDYKU2Aw2f_@^Zs}_HTGb@Ei+kqQq#7*Bd1{DXAq7l>Uc;Hl8dip1*Iz6jk2` z8;4#Lh+|lTxCezwnWPI%A68%qRys$yT&q27R?)4aLs+li!vLe%WP-$A#T{cA#(j8Y zu*Er@Nwy2^7d+|UO3%NH_12q;SJsPP0tq@48fg$}ne0q@GRH*aU4HgQXgLXlckkYb zKY)IaEQpZ)Ht9>xeS%jHdSUIH%U`pUQ`R&544Jx^uh;1+qTAzGnq)hAZ6PZ0JA`=Z z^t3RvN!tc6Zx4CR1sYxo3=7wFHaD}8k&&~MCzaW@RoTIa#rc@@w@|=G-%7Ul#)BVk z-;OpnJ69i;E-6M|I<$<9RgNCaDl!f?_puC5Eoj>RZVY1WQ^mBQy0DyE#LCgSKa6_o zFr}+~}%?79RwF=(XAo!5Tt*opJn2&HO zm8jU*>{%OQ5%w6%DvMI<&CT&nx%69;kzoG_Q<(j{dxzR&BCmaT9Gk~`9@-C^gVk_2Pb8WVZO`KijEL=N?u5M+kv_Gyj6|mkE)^)xm?wF$3q2vetsEaJ7m2vfGae`39it`5V0^`x&81P zDy8|)-Sc}gxnfSWX=%g5Enx3}?s%@LPTv8y_s9-2Ai4SsjrQGwu1xJ^U%f}EudfWv z4I+Kcolxv(Z8tXO zcEr*spr7n#dcTLq=}9myu3~MVbt7|%`pph`EWo|`1>80;n1)s5mx5Qj`y26 zSb8afL~bui(}!}0b~mYIUdt4di(oJW^lqVcC#|<;pC7lPV$@SFfJ=%+C$jVbmLGr!^Re0ILvLY;)wM_DTQCs){)-Qc8$jwpJ&-T5I^hfc6IXHa0H zjDiks6pZuzKCz}4Hs{ewyNPFL2m{_v=$a7!hzow+#~np9q>BQ_c!l5;(b&`V!tN;E z0zEbE93)>LY%DBxLpkkk9b+bCGdn-sHp20cZ-qA_ikrJHJHP9wCoLbU8mw$@lIM6$Xo&kD0g)+IgX~bxHyld^a2Be&H3^i6x3tjKQ?&6 
z%)&BZJ!S-e!<6IE7{^suuEN~bT6B1nVrtU6!iNR7_N)$vWP6O~sc{uek_sC!YB*OO z8iV=}Ha2$sSGeHszd>%mcY?)keY${`E@qXv*lSDMhZrI>&SiC*Mh26gHeEx7v|DGn zH8ftlc#$F;DOqaIxzO60Mc3p98Sm{E$%(vfC~o%0sReMhkfWlk%;%-U+4dPsWqD;J zVLoIdx*>VowYPr4Nx@w07the?bwMr=WW>V+8k9rpfM3G~th6r%Q&i~g;Q{(i&xVbE zdYDZm%^K{j-bmZH7N#$8M(!-2;y7!J1XgZV7@Y2}ym|BH<`Gd02G@lL2W!O1yVuOK z6VGOPNG8}?1>!(Ppql%7CzVPlXqk_{UtYw<#JY_Y!L2;sT5q~qlqNU_$135=dCi-N zX@?H_`e4z{daB-kAJJI&XZ)(u!!v5AoskvVjqL3zT&K&4xwV%STA!>uKSaK7rZd?^#ZG$+C>!4pMOa%@!yxDKLyuv@VKvWaL{CH!I${s=~EES@DI`3Sp`I{qqDz@&o@nFO$DBbmL4`fI854E?egeYEi#(FGg?VUQ|q|Y zzT^|~^786DTv%EncKZQ?ZCghX*Xi|wIX)z$!p`D@tSqk#KgaD|(|L8CMMVmkvX18$ zM=9tfU9n>+&dx_J|9FrmC1AN`Wl@yx9)kKnrA1GJUhAG750kPLs&<0gL32-|Kf>fH zFlk-tt5ss}GgmV+GXr@cEOy&GElCD1un-Ka^7Ig0y;+DnkOgr*pThwZSK)fLfBaFl z%>fpDkf|zxImy~ND zVPhkj!(kF|)iLq88I%-c<`z~m(GY{5PypSfsXYm3=%#8PEH+u}Wlf(~$}`~jDas&n zUO)mAX0c?v>5DeT>T~F+#>Q3+OK*Un0j)>t6XhQoXCv2YqD_j;?Y{jKqOP8wnfbs_ zjLuuok${dx5p}0zgq!k31dGLDl0Y!)>p11j5*Czwg zd3noc$rzgAMF-6)vIYiIHPxRKrp((Z!Ar89x_U!H>)33CaXse0xMH(N@%C;{pi;@v z*f7vCl_I_YO@j~OfKJW$bD>MUKtwEucSlGH`en{yygT1YNJtoVFvYCiaO~MfG3R-k z)N*}-MAFi&a?sgq5Z1{SP&S}n3<^rX@`3W<;wZ%pJOyp>jb~_ZkSGGIeL~42SUuZ(Y)ad91%Mb;Yz z21YD~d8f6&Cc7{yxaG<90SES%CrfIdc6c+d+$ky_Wnzq9^d_Xu9T<^m`b$F?RlD*N zS`tNSw;v19u)EID?ZziwetuzZe%Hfu8AiVAV0az?gWeJhH%O(*J|C$gT$zc3GDytn zG+yUAIdG-zI`mI~%FV~0&U4X`A9nRdJ)1Q1NdMa8i_xxf$mfr}Vd{=lMdk_wus!q# zAnYPSE4BhrVJ2C+?%)S`SB+Lre_z2oMV)9yBTaGX`pxb}+WVQ!QT!0G{MQqaK>?CC zmepz%!=&mJ;<{mRYhODL(Lf_{k^WddZy(s=KnoxH*7O)YCfH;J+qUG6m97Vl?V}*0 zjI^P=c!B3{cX2j#wK+GZK@PtYN?+F*E@#sET31b)bC-%>Yy|6@xo{&@_Cq(173W z_nG6ThTywvzdZw0(4-C=!&dc?s-$GlMUK5~1(O~c@!sl33|_v0TsdkoiY(P{;iah; zl1VoKr?|rd^8yPgvE)=tF%C2Oh3%hn0%v~fQ_+SxdaeQ~AmW1OJf1Eyn;Rb=*Y}q} zUe^QoPEVX5^VvF6_gM2<=^Kf@Q#f=rbR^zOMv|bwpu0~lC^YX%YDrk`*uf{|3-ZG` zKWMq+@G#_Hg;vVeh$8udfN0@k>rFt>l^GHsPYX9O@ zvv2Ib3nR_z3cEzt+l}dOWVo09doxG$;Ij%oj|!?IXyeWFP<;ogxo9#fUM+8%2&Notdd0kQ!yp?DTrupO~% zM}SID1$jf64gRk@v?M{_1KFRpjWnKw8$I>PY4gAF{R8rK^dvzZPzy`Qnkp?U&YPbf 
z?4RK??T1Dx(UPq8rTROOn#-2BP{5u5j&P|Xve5oOzpn3hCq;4rAqnCj6GgT%}Um zJ|Zbn9Icrcp;cCVlrgb6T1*H3Hx5}GEwL|wCU~rORI1EPDM@lx-OezEj zN1n92!ro~kCp(CX<2bX~%wjZx7EMYPiD8jS=A_Z&ul%~eb!pjN<7h&qnk86$Sa<)! zy?bC#>c5+O)w0w~1g$KrVp%?X&W>q1))NXd6c(m$Y#7Ww8*d40l2)2=?uxS7??98% z9}1&ve)dwGJeB5hGF{5si+XDCv}<$1ckiq3h|dh&QrCrJ(^KTV<>{ZX(ex70B~W9* zL8>Zc;>k7a_LiFb-$sj&Qw;`{MXk)E_KxRD%+cqjK*Pz#=0r+FT2D{-W@UGT zJ~{hJu-LOWc0Iw3@uHPPfYPsDdC4@@(b>(^__XHol&!*wE=3VeM^lSqw~LKT@4U`+ zs8shmn70#)`F!Kap3G{9L?ZeZk4zCISLCshPSs+;Z_4RY&NN#xE_S@>HglxNJ{^`ShlH&!qn^-GEQ=)qwl3t zve8FJYc@O>(R9_)C63MhTxTn(`WC(0Grsuv*B=(X(%LcaQ@NnymWlJ;;kGzf2@9v= z4TNplohO$Lp{%n_81WTn#@*W$oDzYP5)!I2GotK21y-b6 zU9zgdahh3jxpDbccF=_BOI}?u!=~1YP>yuzGOF|@eJlG`^h-5cAzX1SJYv*o99>- z(TL}#*%ANypMR}uz9!68WIgjRSqh8pq^MWXh||ZKnxN=4x44Rn*RD;3)}9X17&M{mi7cLo2ljV$kj-G zvZd)qS^w-#GqD2c#It=tRcnAp!3D!=Zl{DM;X{7FR^{!?2^3n&;1 zCFf0i?<>g9Pf|)Vz{+#N^*Y1qNSFPV6Z_Xn_rG=>qplW>V?P1pt@Fu3_^NZ@hK-c* z7=?2aJ8gAcnZV0`k%cQd1@77!=5nm?cMO2_vuGZ{l3!gJVy4C8)k;mr3FXeuYxX7R zf~+ema+@UL3EiTmxMn}@zeZiW5E>H$3GU_V>5?(y;5UJ_wa%S~>3-G03pg57)c{+9 zToSIFSA|N{z4fWIz;gkAuYy90?wD3C2SzUZMD9ykIw3065Mr#D?$$uA0WF5dN{#Yx z>ztpCHSH@%(-ka>2H3R&QH2DZIBmswC&$x+pNxTSq9qI=l@>}1iFVmHkskUKGV;=z z*GA->eFfQq+Tri#O@`VtNg+rwf~UG+`UV!=f%pZVe}#})tQl$1H3<#q^Szi7(n!hD zY^Nl79o9yVX`r#7{UH+Fn1XQ6vXfSRBE%grBgA9o*_13@X3M-(sx+xywpS_6u)}_u za1!H!=c8vO@e0YtLV}XSJf*VP(p9l8`zj1#1+ndI1;Gl0vF%-qVQqbD;p5yB@`3}} zT&BnSdNP8C23&=Lp`^R|K{nqG`A+?klE6+01_xSZpEEi|6B5$+!4s0z%G8Mq1%u!T zHAqh|#RaUP3Wyry1@|s&=^11~y`I4pOV!c{Y?|f@p4i)=1+Rax!D3B%Fxu2O(wZLR zMWrgu7SuA9D_kIlmi2J{HJVU*qBcXJmvCnXeHhtb^tsLWKQD9gv0IBO??2>@9l%YI z+#N_n(PbENFs^GXaJgS$DREJZ_>HWRDvg9Bt+o1y)gVhTUy4{@NV0CNx82Y`Zc~(I z#(0-_hs6#Q&y|tDvmTRCwnk8{z5RNQrlHxDY}SXAGnMDAm82ys(8`kI8JEqW&;3xF)lsq3-Scxy7_uYy z{>$?#twL=k83Q+-cC}@#wom&#sxatkwtCokqp{g8?3MMdZPc2o*xO?ngGkgS;3kva zg5V2%k@Ut!-e=M|SNx?(YNe@|)O!-*#+jOElI3>7XrkC0)7*0w2Vn)8NwMY$<_Z(v z`)p4r%oBFE?GLQT!r5y#M@jf(=?Y1eG=()f&pm>jPOOd=2Iiu836%0DO`7P4;n{FC z4i`RV%Ix}CBQ{7fA 
zxOme)1)|DnqeIcPCefx=^L;^!T{3v2|+UI4EfL^XEPkxq!E9 zmXFBUs(Z7r`%=6sE4{+cSUvQF$2+QN*+!&$m>6YKr32I=*#sZbn_#Dy!8+F#xYY$5 zAHVi}RecmjnIR=gMa9|~e2%@hm@b)9Qxlb}{B$pfLyMvCVS;d&3|o+kn6Qs%`hF5^ zVG2|f;{fS`^MV9&3dh$X15^}bt?|l%yzWq?UsqwP^GSvHt;_kkff=0xV@ib~^U}%e zJUk3SXv8(g@A%e2+G+|SeipG^5s6p|LWgjSxs8^Pho-VdNLpGGuyDY^ zZ<-$L_EIcp+tzsSi3ezt+Nq_*g3HKxJ4al-HJvlxffGSzcX(aK;m?@8%XyX%VDk2n*+V=r^xKoL>}Q1Z zCnyM_aHUE1D#l7KwTTuUdiw$$vssTA4SWC<%>=I}aoLZ#g-q^~4AkR<<0-s_>CS#X z1>u@sf6YARAcc%}q${@iq$eWql-KV++oi6QBuP-9iO!| znj$NRDW%IV37bhxO(wb{Xoy0q3af^qr|99M52~Mc4;VzgC?O% z^$>NBKS^CMz_!S_cmje_fxeD>+UHq5cX1Kc?xH=dmOj{t09Vxm&9 zJ|UoRKYsT6x{BnwiZ}gHo+Ei`ky|Z1g(ms_D<$Z?-L@?FqR>u?$@&R_RC7{7md>_S zLq<>QqivGfonF4!(T?4QFO{qGHO6`Pp&k!)^HFtJXqiTb#uQQ)dezz;Y@=Vv8&^~; zV@t&gWvWpadx63QJWhrGzYsq94VVQau>2e zP*KD_wf^`z4i^>1tOp)2tde%N0VroDeSUdD&Q^+4srxdC?bAPHjwzp4rE7HdU_G;Y}WD|!kEp48wHJ5HD$5}-?Q%8$r<&qvbffL2RCEVi(qAiNLdL{2lmpijCdKqV=t(wq5;VVu_H_}%D+ZIKSC z1FYY?YE@1StxcoSW+V9(4}}oyd^tBD>j6I>R?dN3jK7-B3S~r9eNDsY2l==^!(!D$ zM{0F@MoUXuP*4D>rJVE46&URB{1ea3T7$H6$)~kJ z9Q-fJU^1y3x=SPcOV+3aJph`;+`Th|fI+@f1C5kRxc}E8D85qA&hrTzSMQzHuz^3q#EVh6imo;!Fg*kq%5Y^t4u@}D`L))V#k5-_cVfHO+mFW zT`qepx9t8kvKyb6kl7=Uj0Hu?11`!_ufuxJwZoCmnk1vPK1RCd3PNL`RLdJhcnS0K z2joODE<8`OJ-cUD(z|uO>gnrSTnj6e@hKM+qKI$?(sy&jvbIVwKm*>(Ej8W=Mzi*O3%k>uFqmx zlIdDH8av=JlKVQ-jsl;-d8kFl{UXd{v}omYEjJt?>KrP_HRqGHK%v$?RXt;S?BL?d z)Swr9*mBa*t;=k8mF+Iyb(?yHP?!JXR|c87Ko6~cY1mUparFOs>HqQvZc0@3$TyT+ z_m=;=qvpRk-2ZPslA$Q*NQ!YGspbFWMgL>$kn8^62tc*S|N9F6Zw-J#{Qp7@j-Ao% XPmg_0C#wDd{{(nN-{rhj|NMUdQ{DmU literal 0 HcmV?d00001 diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 08ea12fce24..494ae1310af 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3273,7 +3273,6 @@ def _exec_code_on_head( executable: str, detach_run: bool = False, spot_dag: Optional['dag.Dag'] = None, - service_name: Optional[str] = None, ) -> None: """Executes generated code on the head node.""" style = 
colorama.Style @@ -3392,40 +3391,7 @@ def _exec_code_on_head( '\nTo view the spot job dashboard:\t' f'{backend_utils.BOLD}sky spot dashboard' f'{backend_utils.RESET_BOLD}') - elif (controller == - controller_utils.Controllers.SKY_SERVE_CONTROLLER): - sn = service_name - logger.info( - f'{fore.CYAN}Service name: ' - f'{style.BRIGHT}{sn}{style.RESET_ALL}' - '\nTo see detailed info:\t\t' - f'{backend_utils.BOLD}sky serve status {sn} (-a)' - f'{backend_utils.RESET_BOLD}' - '\nTo teardown the service:\t\t' - f'{backend_utils.BOLD}sky serve down {sn}' - f'{backend_utils.RESET_BOLD}' - '\n' - '\nTo see logs of a replica:\t' - f'{backend_utils.BOLD}sky serve logs {sn} [REPLICA_ID]' - f'{backend_utils.RESET_BOLD}' - '\nTo see logs of load balancer:\t' - f'{backend_utils.BOLD}sky serve logs --load-balancer {sn}' - f'{backend_utils.RESET_BOLD}' - '\nTo see logs of controller:\t' - f'{backend_utils.BOLD}sky serve logs --controller {sn}' - f'{backend_utils.RESET_BOLD}' - '\n' - '\nTo monitor replica status:\t' - f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' - f'{backend_utils.RESET_BOLD}' - '\nTo send a test request:\t\t' - f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' - f'--endpoint){backend_utils.RESET_BOLD}' - f'\n{fore.GREEN}SkyServe is bootstrapping your service now.' 
- f'{style.RESET_ALL}' - f'\n{fore.GREEN}The endpoint and replicas should be ready ' - f'within a short time.{style.RESET_ALL}') - else: + elif controller is None: logger.info(f'{fore.CYAN}Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' '\nTo cancel the job:\t' @@ -4759,8 +4725,7 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag, - service_name=task.service_name) + spot_dag=task.spot_dag) def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, @@ -4834,5 +4799,4 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, job_id, executable='python3', detach_run=detach_run, - spot_dag=task.spot_dag, - service_name=task.service_name) + spot_dag=task.spot_dag) diff --git a/sky/execution.py b/sky/execution.py index 0328e07d833..2288a2a6122 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1109,6 +1109,39 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: ) # TODO(tian): Use this to check the endpoint and if the # service name is duplicated. 
+ sn = service_name + style = colorama.Style + fore = colorama.Fore + sky_logging.print( + f'{fore.CYAN}Service name: ' + f'{style.BRIGHT}{sn}{style.RESET_ALL}' + '\nTo see detailed info:\t\t' + f'{backend_utils.BOLD}sky serve status {sn} (-a)' + f'{backend_utils.RESET_BOLD}' + '\nTo teardown the service:\t\t' + f'{backend_utils.BOLD}sky serve down {sn}' + f'{backend_utils.RESET_BOLD}' + '\n' + '\nTo see logs of a replica:\t' + f'{backend_utils.BOLD}sky serve logs {sn} [REPLICA_ID]' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of load balancer:\t' + f'{backend_utils.BOLD}sky serve logs --load-balancer {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of controller:\t' + f'{backend_utils.BOLD}sky serve logs --controller {sn}' + f'{backend_utils.RESET_BOLD}' + '\n' + '\nTo monitor replica status:\t' + f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' + f'{backend_utils.RESET_BOLD}' + '\nTo send a test request:\t\t' + f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' + f'--endpoint){backend_utils.RESET_BOLD}' + f'\n{fore.GREEN}SkyServe is bootstrapping your service now.' + f'{style.RESET_ALL}' + f'\n{fore.GREEN}The endpoint and replicas should be ready ' + f'within a short time.{style.RESET_ALL}') @usage_lib.entrypoint diff --git a/sky/serve/README.md b/sky/serve/README.md index 34c1406cd1e..1131849a8d3 100644 --- a/sky/serve/README.md +++ b/sky/serve/README.md @@ -8,11 +8,13 @@ Sky Serve transparently handles load balancing, failover and autoscaling of the ## Architecture +![Architecture](../../docs/source/images/sky-serve-architecture.png) + Sky Serve has four key components: -1. Redirector - The HTTP server is responsible for receiving requests and redirecting them to healthy endpoints. +1. Redirector - receiving requests and redirecting them to healthy endpoints. 2. Load balancers - spread requests across healthy endpoints according to different policies. -3. 
Autoscalers - scale up and down the number of serving endpoints according to different policies and handle recovery of unhealthy endpoints. -4. Replica Managers - provides a uniform interface to talk to SkyPilot. +3. Autoscalers - scale up and down the number of serving endpoints according to different policies. +4. Replica Managers - monitoring replica status and handle recovery of unhealthy endpoints. ## Usage diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 309a2acfb97..6281b3b4a0b 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -5,6 +5,7 @@ import logging import threading import time +import traceback import fastapi import uvicorn @@ -15,7 +16,9 @@ from sky.serve import constants from sky.serve import replica_managers from sky.serve import serve_state +from sky.utils import common_utils from sky.utils import env_options +from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) @@ -76,7 +79,10 @@ def _run_autoscaler(self): except Exception as e: # pylint: disable=broad-except # No matter what error happens, we should keep the # monitor running. 
- logger.error(f'Error in autoscaler: {e}') + logger.error('Error in autoscaler: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.error(f' Traceback: {traceback.format_exc()}') time.sleep(self._autoscaler.frequency) def run(self) -> None: diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index aa4ca187df0..7e00e3a16ae 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -309,10 +309,20 @@ def __init__(self, replica_id: int, cluster_name: str) -> None: self.consecutive_failure_times: List[float] = [] self.status_property: ReplicaStatusProperty = ReplicaStatusProperty() - @property - def handle(self) -> Optional[backends.CloudVmRayResourceHandle]: - cluster_record = global_user_state.get_cluster_from_name( - self.cluster_name) + def handle( + self, + cluster_record: Optional[Dict[str, Any]] = None + ) -> Optional[backends.CloudVmRayResourceHandle]: + """Get the handle of the cluster. + + Args: + cluster_record: The cluster record in the cluster table. If not + provided, will fetch the cluster record from the cluster table + based on the cluster name. 
+ """ + if cluster_record is None: + cluster_record = global_user_state.get_cluster_from_name( + self.cluster_name) if cluster_record is None: return None handle = cluster_record['handle'] @@ -321,7 +331,7 @@ def handle(self) -> Optional[backends.CloudVmRayResourceHandle]: @property def ip(self) -> Optional[str]: - handle = self.handle + handle = self.handle() if handle is None: return None return handle.head_ip @@ -335,13 +345,17 @@ def status(self) -> serve_state.ReplicaStatus: return replica_status def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: + cluster_record = global_user_state.get_cluster_from_name( + self.cluster_name) info_dict = { 'replica_id': self.replica_id, 'name': self.cluster_name, 'status': self.status, + 'launched_at': (cluster_record['launched_at'] + if cluster_record is not None else None), } if with_handle: - info_dict['handle'] = self.handle + info_dict['handle'] = self.handle(cluster_record) return info_dict def probe( @@ -670,7 +684,7 @@ def _process_pool_refresher(self) -> None: logger.error('Error in process pool refresher: ' f'{common_utils.format_exception(e)}') with ux_utils.enable_traceback(): - logger.info(f' Traceback: {traceback.format_exc()}') + logger.error(f' Traceback: {traceback.format_exc()}') time.sleep(_PROCESS_POOL_REFRESH_INTERVAL) @with_lock @@ -692,7 +706,7 @@ def _fetch_job_status(self) -> None: # We use backend API to avoid usage collection in the # core.job_status. 
backend = backends.CloudVmRayBackend() - handle = info.handle + handle = info.handle() assert handle is not None, info # Use None to fetch latest job, which stands for user task job job_statuses = backend.get_job_status(handle, @@ -722,7 +736,7 @@ def _job_status_fetcher(self) -> None: logger.error('Error in job status fetcher: ' f'{common_utils.format_exception(e)}') with ux_utils.enable_traceback(): - logger.info(f' Traceback: {traceback.format_exc()}') + logger.error(f' Traceback: {traceback.format_exc()}') time.sleep(_JOB_STATUS_FETCH_INTERVAL) @with_lock @@ -770,7 +784,7 @@ def _probe_all_replicas(self) -> None: if info.status_property.first_ready_time is None: info.status_property.first_ready_time = probe_time else: - handle = info.handle + handle = info.handle() if handle is None: logger.error('Cannot find handle for ' f'replica {info.replica_id}.') @@ -862,5 +876,5 @@ def _replica_prober(self) -> None: logger.error('Error in replica prober: ' f'{common_utils.format_exception(e)}') with ux_utils.enable_traceback(): - logger.info(f' Traceback: {traceback.format_exc()}') + logger.error(f' Traceback: {traceback.format_exc()}') time.sleep(serve_constants.ENDPOINT_PROBE_INTERVAL_SECONDS) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 79f3277c740..e5ddb1eb6ba 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -243,6 +243,8 @@ def update_service_status() -> None: if controller_job_id is None: # The service just registered and the controller job is not # scheduled yet. + # TODO(tian): Remove this once we merge #2736 and not register + # service previous than the controller job. 
continue controller_status = job_lib.get_status(controller_job_id) if controller_status is None or controller_status.is_terminal(): @@ -283,28 +285,28 @@ def get_serve_status(service_name: str, def get_serve_status_encoded(service_names: Optional[List[str]]) -> str: - serve_statuss = [] + serve_statuses = [] if service_names is None: # Get all service names service_names = serve_state.get_glob_service_names(None) for service_name in service_names: serve_status = get_serve_status(service_name) - serve_statuss.append({ + serve_statuses.append({ k: base64.b64encode(pickle.dumps(v)).decode('utf-8') for k, v in serve_status.items() }) - return common_utils.encode_payload(serve_statuss) + return common_utils.encode_payload(serve_statuses) def load_serve_status(payload: str) -> List[Dict[str, Any]]: - serve_statuss_encoded = common_utils.decode_payload(payload) - serve_statuss = [] - for serve_status in serve_statuss_encoded: - serve_statuss.append({ + serve_statuses_encoded = common_utils.decode_payload(payload) + serve_statuses = [] + for serve_status in serve_statuses_encoded: + serve_statuses.append({ k: pickle.loads(base64.b64decode(v)) for k, v in serve_status.items() }) - return serve_statuss + return serve_statuses def terminate_services(service_names: Optional[List[str]]) -> str: diff --git a/sky/serve/service.py b/sky/serve/service.py index 0ca75051f82..e58db407e67 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -8,6 +8,7 @@ import pathlib import shutil import time +import traceback from typing import Dict, List import filelock @@ -98,7 +99,10 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: backend = cloud_vm_ray_backend.CloudVmRayBackend() backend.teardown_ephemeral_storage(task) except Exception as e: # pylint: disable=broad-except - logger.error(f'Failed to clean up storage: {e}') + logger.error('Failed to clean up storage: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.error(f' Traceback: 
{traceback.format_exc()}') failed = True return failed diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 8ddfa875e9c..f6d0b57ce33 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -163,6 +163,7 @@ def format_replica_table(replica_records: List[_ReplicaRecord], StatusColumn('SERVICE_NAME', _get_service_name), StatusColumn('ID', _get_replica_id), StatusColumn('IP', _get_head_ip), + StatusColumn('LAUNCHED', _get_launched), StatusColumn( 'RESOURCES', _get_full_replica_resources if show_all else _get_replica_resources, diff --git a/sky/utils/ux_utils.py b/sky/utils/ux_utils.py index 14c7be937f3..f4dd26a2da9 100644 --- a/sky/utils/ux_utils.py +++ b/sky/utils/ux_utils.py @@ -9,6 +9,7 @@ from sky import sky_logging from sky.utils import common_utils from sky.utils import env_options +from sky.utils import ux_utils console = rich_console.Console() @@ -96,6 +97,7 @@ def run(self, *args, **kwargs): self.func(*args, **kwargs) except Exception as e: # pylint: disable=broad-except logger.error(f'Failed to run {self.func.__name__}. ' - f'Details: {common_utils.format_exception(e)}\n' - f'Traceback:\n{traceback.format_exc()}') + f'Details: {common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.error(f' Traceback:\n{traceback.format_exc()}') raise From 61814979e9f150af3ed76d5855f50a40206ecfbc Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 00:13:44 -0800 Subject: [PATCH 190/223] comments --- sky/cli.py | 19 +++++++++++++++++++ sky/core.py | 4 ++++ sky/serve/service.py | 2 ++ 3 files changed, 25 insertions(+) diff --git a/sky/cli.py b/sky/cli.py index b7d4b8b8ebf..08d353a69d3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4181,12 +4181,31 @@ def serve_up( SERVICE_YAML must point to a valid YAML file. + A regular task YAML can be turned into a service YAML by adding a `service` + field. E.g., + + .. 
code-block:: yaml + + # service.yaml + service: + ports: 8080 + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 + + resources: + cpus: 2+ + + run: python -m http.server 8080 + Example: .. code-block:: bash sky serve up service.yaml """ + # TODO(tian): Update the example after we move the ports to resources. if service_name is None: service_name = serve_lib.generate_service_name() diff --git a/sky/core.py b/sky/core.py index e94b26614ed..11f6faac485 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1045,6 +1045,10 @@ def serve_status( Returns: A list of dicts, with each dict containing the information of a service. If a service is not found, it will be omitted from the returned list. + + Raises: + RuntimeError: if failed to get the service status. + exceptions.ClusterNotUpError: if the sky serve controller is not up. """ if service_names is not None: if isinstance(service_names, str): diff --git a/sky/serve/service.py b/sky/serve/service.py index e58db407e67..e44a1bd452e 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -132,6 +132,8 @@ def _start(service_name: str, task_yaml: str, job_id: int): requested_resources = resources.Resources.from_yaml_config(resources_config) status = serve_state.ServiceStatus.CONTROLLER_INIT if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: + # TODO(tian): Probably we should raise an error and not pending here. + # This busy loop is also a ray job and will take a lot of memory. 
status = serve_state.ServiceStatus.PENDING # Here, the service record might already registered in the database if the # controller is UP, but also might not if the controller is STOPPED or not From 2c0f3c826043859f7cd4f203dc33521ff89a03e0 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 01:16:41 -0800 Subject: [PATCH 191/223] move port to resources --- examples/serve/gorilla/gorilla.yaml | 2 +- examples/serve/http_server/task.yaml | 2 +- examples/serve/llama2/llama2.yaml | 2 +- examples/serve/misc/cancel/service.yaml | 2 +- examples/serve/ray_serve/ray_serve.yaml | 2 +- examples/serve/stable_diffusion_service.yaml | 2 +- examples/serve/tgi_coder.yaml | 2 +- examples/serve/vicuna-v1.5.yaml | 2 +- examples/serve/vllm.yaml | 2 +- sky/cli.py | 6 +-- sky/execution.py | 11 ++--- sky/serve/controller.py | 4 +- sky/serve/load_balancer.py | 24 +++++----- sky/serve/load_balancing_policies.py | 7 +-- sky/serve/replica_managers.py | 47 +++++++++++++------- sky/serve/service.py | 3 +- sky/serve/service_spec.py | 22 +-------- sky/utils/schemas.py | 5 +-- tests/skyserve/auto_restart.yaml | 15 ++++--- tests/skyserve/http/aws.yaml | 14 +++--- tests/skyserve/http/azure.yaml | 14 +++--- tests/skyserve/http/gcp.yaml | 14 +++--- tests/skyserve/llm/service.yaml | 10 ++--- tests/skyserve/replica_failure/service.yaml | 20 ++++----- tests/skyserve/spot/recovery.yaml | 15 ++++--- tests/skyserve/spot/user_bug.yaml | 15 ++++--- 26 files changed, 126 insertions(+), 138 deletions(-) diff --git a/examples/serve/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml index 5549b7156dd..8e01c713932 100644 --- a/examples/serve/gorilla/gorilla.yaml +++ b/examples/serve/gorilla/gorilla.yaml @@ -9,13 +9,13 @@ # sky serve status --endpoint gorilla service: - port: 8087 readiness_probe: path: /v1/models initial_delay_seconds: 1800 replicas: 2 resources: + ports: 8087 accelerators: A100:1 disk_size: 1024 disk_tier: high diff --git a/examples/serve/http_server/task.yaml 
b/examples/serve/http_server/task.yaml index e18e0ff9c1b..8b63f6a53a9 100644 --- a/examples/serve/http_server/task.yaml +++ b/examples/serve/http_server/task.yaml @@ -7,13 +7,13 @@ # sky serve status --endpoint http service: - port: 8081 readiness_probe: path: /health initial_delay_seconds: 20 replicas: 2 resources: + ports: 8081 cpus: 2+ workdir: examples/serve/http_server diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml index cd1e9f481ba..78f0dc3a811 100644 --- a/examples/serve/llama2/llama2.yaml +++ b/examples/serve/llama2/llama2.yaml @@ -13,11 +13,11 @@ # TODO(tian): Change usage to `HF_TOKEN= sky serve up -n llama2 examples/serve/llama2/llama2.yaml --env HF_TOKEN` once we have `--env` enabled. service: - port: 8087 readiness_probe: /v1/models replicas: 2 resources: + ports: 8087 memory: 32+ accelerators: T4:1 disk_size: 1024 diff --git a/examples/serve/misc/cancel/service.yaml b/examples/serve/misc/cancel/service.yaml index 1da3fb9f086..d4f745d935f 100644 --- a/examples/serve/misc/cancel/service.yaml +++ b/examples/serve/misc/cancel/service.yaml @@ -1,12 +1,12 @@ # Usage: Please refer to the README.md in this directory. 
service: - port: 9000 readiness_probe: path: /health initial_delay_seconds: 120 resources: + ports: 9000 cpus: 2+ workdir: examples/serve/misc/cancel diff --git a/examples/serve/ray_serve/ray_serve.yaml b/examples/serve/ray_serve/ray_serve.yaml index efcffb37110..1a2ee8ea65d 100644 --- a/examples/serve/ray_serve/ray_serve.yaml +++ b/examples/serve/ray_serve/ray_serve.yaml @@ -4,11 +4,11 @@ # sky serve up examples/serve/ray_serve/ray_serve.yaml service: - port: 8000 readiness_probe: / replicas: 1 resources: + ports: 8000 cpus: 2+ workdir: examples/serve/ray_serve diff --git a/examples/serve/stable_diffusion_service.yaml b/examples/serve/stable_diffusion_service.yaml index c3671fa29dd..86ef257e7ca 100644 --- a/examples/serve/stable_diffusion_service.yaml +++ b/examples/serve/stable_diffusion_service.yaml @@ -7,11 +7,11 @@ # sky serve status --endpoint sd service: - port: 7860 readiness_probe: / replicas: 2 resources: + ports: 7860 accelerators: V100:1 file_mounts: diff --git a/examples/serve/tgi_coder.yaml b/examples/serve/tgi_coder.yaml index 4363f0cf60c..8f2c1d78358 100644 --- a/examples/serve/tgi_coder.yaml +++ b/examples/serve/tgi_coder.yaml @@ -7,11 +7,11 @@ # sky serve status --endpoint tgi service: - port: 8082 readiness_probe: /health replicas: 2 resources: + ports: 8082 accelerators: A100:1 # TODO(tian): Maybe use some small model like 3b. 
diff --git a/examples/serve/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml index 8b81b014cee..689c38b9263 100644 --- a/examples/serve/vicuna-v1.5.yaml +++ b/examples/serve/vicuna-v1.5.yaml @@ -7,11 +7,11 @@ # sky serve status --endpoint vicuna service: - port: 8087 readiness_probe: /v1/models replicas: 2 resources: + ports: 8087 accelerators: A100:1 disk_size: 1024 disk_tier: high diff --git a/examples/serve/vllm.yaml b/examples/serve/vllm.yaml index de2e16225bc..f6c477ca12f 100644 --- a/examples/serve/vllm.yaml +++ b/examples/serve/vllm.yaml @@ -7,7 +7,6 @@ # sky serve status --endpoint vllm service: - port: 8081 readiness_probe: path: /v1/models # vllm takes 5-10 minutes to install @@ -15,6 +14,7 @@ service: replicas: 2 resources: + ports: 8081 accelerators: A100:1 setup: | diff --git a/sky/cli.py b/sky/cli.py index 08d353a69d3..734627cd34b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4225,11 +4225,11 @@ def serve_up( 'To fix, add a valid `service` field.') assert len(task.resources) == 1 requested_resources = list(task.resources)[0] - if requested_resources.ports is not None: + if requested_resources.ports is None or len(requested_resources.ports) != 1: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Specifying ports in resources is not allowed. Each replica ' - 'will use the port specified in the service section.') + 'Must only specify one port in resources. 
Each replica ' + 'will use the port specified as application ingress port.') click.secho('Service Spec:', fg='cyan') click.echo(task.service) diff --git a/sky/execution.py b/sky/execution.py index 2288a2a6122..6a09df145ce 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1007,14 +1007,11 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: assert len(task.resources) == 1, task requested_resources = list(task.resources)[0] - if requested_resources.ports is not None: + if requested_resources.ports is None or len(requested_resources.ports) != 1: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Specifying ports in resources is not allowed. SkyServe will ' - 'use the port specified in the service section.') - - task.set_resources( - requested_resources.copy(ports=[task.service.replica_port])) + 'Must only specify one port in resources. Each replica ' + 'will use the port specified as application ingress port.') with rich_utils.safe_status( '[cyan]Registering service on the controller[/]'): @@ -1118,7 +1115,7 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: '\nTo see detailed info:\t\t' f'{backend_utils.BOLD}sky serve status {sn} (-a)' f'{backend_utils.RESET_BOLD}' - '\nTo teardown the service:\t\t' + '\nTo teardown the service:\t' f'{backend_utils.BOLD}sky serve down {sn}' f'{backend_utils.RESET_BOLD}' '\n' diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 6281b3b4a0b..95bb97105d9 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -95,8 +95,8 @@ async def load_balancer_sync(request: fastapi.Request): f'Received inflight request information: {request_aggregator}') self._autoscaler.collect_request_information(request_aggregator) return { - 'ready_replica_ips': - self._replica_manager.get_ready_replica_ips() + 'ready_replica_urls': + self._replica_manager.get_ready_replica_urls() } @self._app.on_event('startup') diff --git a/sky/serve/load_balancer.py 
b/sky/serve/load_balancer.py index b39bcf5ac2d..28722a72db9 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -26,19 +26,16 @@ class SkyServeLoadBalancer: `curl -L`. """ - def __init__(self, controller_url: str, load_balancer_port: int, - replica_port: int) -> None: + def __init__(self, controller_url: str, load_balancer_port: int) -> None: """Initialize the load balancer. Args: controller_url: The URL of the controller. load_balancer_port: The port where the load balancer listens to. - replica_port: The port where the replica app listens to. """ self._app = fastapi.FastAPI() self._controller_url = controller_url self._load_balancer_port = load_balancer_port - self._replica_port = replica_port self._load_balancing_policy: lb_policies.LoadBalancingPolicy = ( lb_policies.RoundRobinPolicy()) self._request_aggregator: serve_utils.RequestsAggregator = ( @@ -70,26 +67,27 @@ def _sync_with_controller(self): # Clean up after reporting request information to avoid OOM. self._request_aggregator.clear() response.raise_for_status() - ready_replica_ips = response.json().get('ready_replica_ips') + ready_replica_urls = response.json().get( + 'ready_replica_urls') except requests.RequestException as e: print(f'An error occurred: {e}') else: - logger.info(f'Available Replica IPs: {ready_replica_ips}') + logger.info(f'Available Replica URLs: {ready_replica_urls}') self._load_balancing_policy.set_ready_replicas( - ready_replica_ips) + ready_replica_urls) time.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS) async def _redirect_handler(self, request: fastapi.Request): self._request_aggregator.add(request) - replica_ip = self._load_balancing_policy.select_replica(request) + ready_replica_url = self._load_balancing_policy.select_replica(request) - if replica_ip is None: + if ready_replica_url is None: raise fastapi.HTTPException(status_code=503, detail='No available replicas. 
' 'Use "sky serve status [SERVICE_ID]" ' 'to check the replica status.') - path = f'http://{replica_ip}:{self._replica_port}{request.url.path}' + path = f'http://{ready_replica_url}{request.url.path}' logger.info(f'Redirecting request to {path}') return fastapi.responses.RedirectResponse(url=path) @@ -112,9 +110,7 @@ def configure_logger(): uvicorn.run(self._app, host='0.0.0.0', port=self._load_balancer_port) -def run_load_balancer(controller_addr: str, load_balancer_port: int, - replica_port: int): +def run_load_balancer(controller_addr: str, load_balancer_port: int): load_balancer = SkyServeLoadBalancer(controller_url=controller_addr, - load_balancer_port=load_balancer_port, - replica_port=replica_port) + load_balancer_port=load_balancer_port) load_balancer.run() diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index 76f6f42ab7d..6bf9e0e44a2 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -43,7 +43,7 @@ def set_ready_replicas(self, ready_replicas: List[str]) -> None: def select_replica(self, request: fastapi.Request) -> Optional[str]: if not self.ready_replicas: return None - replica_ip = self.ready_replicas[self.index] + ready_replica_url = self.ready_replicas[self.index] self.index = (self.index + 1) % len(self.ready_replicas) request_repr = (' Optional[str]: f'headers={dict(request.headers)} ' f'query_params={dict(request.query_params)}' '>') - logger.info(f'Selected replica {replica_ip} for request {request_repr}') - return replica_ip + logger.info(f'Selected replica {ready_replica_url} ' + f'for request {request_repr}') + return ready_replica_url diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 7e00e3a16ae..d6006e534a3 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -146,6 +146,16 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: time.sleep(gap_seconds) +def 
_get_resources_ports(task_yaml: str) -> str: + """Get the resources ports used by the task.""" + task = sky.Task.from_yaml(task_yaml) + assert len(task.resources) == 1, task + task_resources = list(task.resources)[0] + # Already checked the resources have and only have one port + # before upload the task yaml. + return task_resources.ports[0] + + def with_lock(func): @functools.wraps(func) @@ -302,9 +312,11 @@ def to_replica_status(self) -> serve_state.ReplicaStatus: class ReplicaInfo: """Replica info for each replica.""" - def __init__(self, replica_id: int, cluster_name: str) -> None: + def __init__(self, replica_id: int, cluster_name: str, + replica_port: str) -> None: self.replica_id: int = replica_id self.cluster_name: str = cluster_name + self.replica_port: str = replica_port self.first_not_ready_time: Optional[float] = None self.consecutive_failure_times: List[float] = [] self.status_property: ReplicaStatusProperty = ReplicaStatusProperty() @@ -330,11 +342,11 @@ def handle( return handle @property - def ip(self) -> Optional[str]: + def url(self) -> Optional[str]: handle = self.handle() if handle is None: return None - return handle.head_ip + return f'{handle.head_ip}:{self.replica_port}' @property def status(self) -> serve_state.ReplicaStatus: @@ -360,7 +372,7 @@ def to_info_dict(self, with_handle: bool) -> Dict[str, Any]: def probe( self, - readiness_route: str, + readiness_path: str, post_data: Optional[Dict[str, Any]], ) -> Tuple['ReplicaInfo', bool, float]: """Probe the readiness of the replica. @@ -368,12 +380,12 @@ def probe( Returns: Tuple of (self, is_ready, probe_time). """ - replica_identity = f'replica {self.replica_id} with ip {self.ip}' + replica_identity = f'replica {self.replica_id} with url {self.url}' probe_time = time.time() try: msg = '' # TODO(tian): Support HTTPS in the future. 
- readiness_path = f'http://{self.ip}{readiness_route}' + readiness_path = (f'http://{self.url}{readiness_path}') if post_data is not None: msg += 'POST' response = requests.post( @@ -410,15 +422,15 @@ def __init__(self, service_name: str, self._next_replica_id: int = 1 self._service_name: str = service_name self._auto_restart = spec.auto_restart - self._readiness_route: str = spec.readiness_route + self._readiness_path: str = spec.readiness_path self._initial_delay_seconds: int = spec.initial_delay_seconds self._post_data: Optional[Dict[str, Any]] = spec.post_data self._uptime: Optional[float] = None - logger.info(f'Readiness probe suffix: {self._readiness_route}\n' + logger.info(f'Readiness probe path: {self._readiness_path}\n' f'Initial delay seconds: {self._initial_delay_seconds}\n' f'Post data: {self._post_data}') - def get_ready_replica_ips(self) -> List[str]: + def get_ready_replica_urls(self) -> List[str]: """Get all ready replica's IP addresses.""" raise NotImplementedError @@ -460,14 +472,14 @@ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', # Replica management functions # ################################ - def get_ready_replica_ips(self) -> List[str]: - ready_replicas = [] + def get_ready_replica_urls(self) -> List[str]: + ready_replica_urls = [] infos = serve_state.get_replica_infos(self._service_name) for info in infos: if info.status == serve_state.ReplicaStatus.READY: - assert info.ip is not None - ready_replicas.append(info.ip) - return ready_replicas + assert info.url is not None + ready_replica_urls.append(info.url) + return ready_replica_urls def _launch_replica(self, replica_id: int) -> None: if replica_id in self._launch_process_pool: @@ -489,7 +501,8 @@ def _launch_replica(self, replica_id: int) -> None: # Don't start right now; we will start it later in _refresh_process_pool # to avoid too many sky.launch running at the same time. 
self._launch_process_pool[replica_id] = p - info = ReplicaInfo(replica_id, cluster_name) + replica_port = _get_resources_ports(self._task_yaml_path) + info = ReplicaInfo(replica_id, cluster_name, replica_port) serve_state.add_or_update_replica(self._service_name, replica_id, info) def scale_up(self, n: int) -> None: @@ -757,10 +770,10 @@ def _probe_all_replicas(self) -> None: if not info.status_property.should_track_status(): continue replica_to_probe.append( - f'replica_{info.replica_id}(ip={info.ip})') + f'replica_{info.replica_id}(url={info.url})') probe_futures.append( pool.apply_async(info.probe, - (self._readiness_route, self._post_data))) + (self._readiness_path, self._post_data))) logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}') # Since futures.as_completed will return futures in the order of diff --git a/sky/serve/service.py b/sky/serve/service.py index e44a1bd452e..8e605e57f30 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -174,7 +174,6 @@ def _start(service_name: str, task_yaml: str, job_id: int): # TODO(tian): Support HTTPS. 
controller_addr = f'http://localhost:{controller_port}' - replica_port = int(service_spec.replica_port) load_balancer_port = common_utils.find_free_port( constants.LOAD_BALANCER_PORT_START) @@ -186,7 +185,7 @@ def _start(service_name: str, task_yaml: str, job_id: int): target=ux_utils.RedirectOutputForProcess( load_balancer.run_load_balancer, load_balancer_log_file).run, - args=(controller_addr, load_balancer_port, replica_port)) + args=(controller_addr, load_balancer_port)) load_balancer_process.start() serve_state.set_service_load_balancer_port(service_name, load_balancer_port) diff --git a/sky/serve/service_spec.py b/sky/serve/service_spec.py index 1b3a9e93aa7..6acd5a983bc 100644 --- a/sky/serve/service_spec.py +++ b/sky/serve/service_spec.py @@ -19,7 +19,6 @@ def __init__( self, readiness_path: str, initial_delay_seconds: int, - replica_port: int, min_replicas: int, max_replicas: Optional[int] = None, qps_upper_threshold: Optional[float] = None, @@ -42,12 +41,6 @@ def __init__( f'Got: {readiness_path}') self._readiness_path = readiness_path self._initial_delay_seconds = initial_delay_seconds - if replica_port < 0 or replica_port > 65535: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Invalid app port: {replica_port}. ' - 'Please use a port number between 0 and 65535.') - self._replica_port = str(replica_port) self._min_replicas = min_replicas self._max_replicas = max_replicas self._qps_upper_threshold = qps_upper_threshold @@ -65,8 +58,7 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': 'Cannot specify both `replicas` and `replica_policy` in ' 'the service YAML. 
Please use one of them.') - service_config = {} - service_config['replica_port'] = config['port'] + service_config: Dict[str, Any] = {} readiness_section = config['readiness_probe'] if isinstance(readiness_section, str): @@ -79,8 +71,7 @@ def from_yaml_config(config: Dict[str, Any]) -> 'SkyServiceSpec': 'initial_delay_seconds', None) post_data = readiness_section.get('post_data', None) if initial_delay_seconds is None: - ids = constants.DEFAULT_INITIAL_DELAY_SECONDS - initial_delay_seconds = ids + initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS service_config['initial_delay_seconds'] = initial_delay_seconds if isinstance(post_data, str): try: @@ -152,7 +143,6 @@ def add_if_not_none(section, key, value, no_empty: bool = False): config[section] = dict() config[section][key] = value - add_if_not_none('port', None, int(self.replica_port)) add_if_not_none('readiness_probe', 'path', self.readiness_path) add_if_not_none('readiness_probe', 'initial_delay_seconds', self.initial_delay_seconds) @@ -189,10 +179,6 @@ def __repr__(self) -> str: Replica auto restart: {self.auto_restart}\ """) - @property - def readiness_route(self) -> str: - return f':{self._replica_port}{self._readiness_path}' - @property def readiness_path(self) -> str: return self._readiness_path @@ -201,10 +187,6 @@ def readiness_path(self) -> str: def initial_delay_seconds(self) -> int: return self._initial_delay_seconds - @property - def replica_port(self) -> str: - return self._replica_port - @property def min_replicas(self) -> int: return self._min_replicas diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 2b442473275..4acdde3abe8 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -160,12 +160,9 @@ def get_service_schema(): return { '$schema': 'https://json-schema.org/draft/2020-12/schema', 'type': 'object', - 'required': ['port', 'readiness_probe'], + 'required': ['readiness_probe'], 'additionalProperties': False, 'properties': { - 'port': { - 'type': 'integer', 
- }, 'readiness_probe': { 'anyOf': [{ 'type': 'string', diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index 5cd60653b76..0b440753902 100644 --- a/tests/skyserve/auto_restart.yaml +++ b/tests/skyserve/auto_restart.yaml @@ -1,4 +1,12 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 + + resources: + ports: 8080 cloud: gcp zone: us-central1-a cpus: 2+ @@ -6,10 +14,3 @@ resources: workdir: examples/serve/http_server run: python3 server.py --port 8080 - -service: - port: 8080 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 1 diff --git a/tests/skyserve/http/aws.yaml b/tests/skyserve/http/aws.yaml index 2faff262427..cd7217b3d61 100644 --- a/tests/skyserve/http/aws.yaml +++ b/tests/skyserve/http/aws.yaml @@ -1,14 +1,14 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + resources: + ports: 8081 cloud: aws cpus: 2+ workdir: examples/serve/http_server run: python3 server.py - -service: - port: 8081 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 2 diff --git a/tests/skyserve/http/azure.yaml b/tests/skyserve/http/azure.yaml index 2b67158fcd1..fa124bc700d 100644 --- a/tests/skyserve/http/azure.yaml +++ b/tests/skyserve/http/azure.yaml @@ -1,4 +1,11 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 200 + replicas: 2 + resources: + ports: 8081 cloud: azure cpus: 2+ @@ -6,10 +13,3 @@ workdir: examples/serve/http_server # Use 8081 to test jupyterhub service is terminated run: python3 server.py --port 8081 - -service: - port: 8081 - readiness_probe: - path: /health - initial_delay_seconds: 200 - replicas: 2 diff --git a/tests/skyserve/http/gcp.yaml b/tests/skyserve/http/gcp.yaml index abefcc6563a..4ab431fba75 100644 --- a/tests/skyserve/http/gcp.yaml +++ b/tests/skyserve/http/gcp.yaml @@ -1,4 +1,11 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 2 + 
resources: + ports: 8080 cloud: gcp cpus: 2+ @@ -6,10 +13,3 @@ workdir: examples/serve/http_server # Use 8080 to test jupyter service is terminated run: python3 server.py --port 8080 - -service: - port: 8080 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 2 diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml index 42f0a83af02..48b5344db59 100644 --- a/tests/skyserve/llm/service.yaml +++ b/tests/skyserve/llm/service.yaml @@ -1,13 +1,13 @@ +service: + readiness_probe: /v1/models + replicas: 1 + resources: + ports: 8087 cloud: gcp accelerators: T4 memory: 32+ -service: - port: 8087 - readiness_probe: /v1/models - replicas: 1 - setup: | conda activate chatbot if [ $? -ne 0 ]; then diff --git a/tests/skyserve/replica_failure/service.yaml b/tests/skyserve/replica_failure/service.yaml index 7951f5696f8..aa75f4336ee 100644 --- a/tests/skyserve/replica_failure/service.yaml +++ b/tests/skyserve/replica_failure/service.yaml @@ -1,4 +1,14 @@ +service: + readiness_probe: + path: /health + # For install dependencies + initial_delay_seconds: 180 + replica_policy: + min_replicas: 3 + auto_restart: false + resources: + ports: 8080 cloud: gcp zone: us-central1-a cpus: 2+ @@ -8,13 +18,3 @@ workdir: tests/skyserve/replica_failure setup: pip install fastapi[all] uvicorn run: python3 server.py --port 8080 - -service: - port: 8080 - readiness_probe: - path: /health - # For install dependencies - initial_delay_seconds: 180 - replica_policy: - min_replicas: 3 - auto_restart: false diff --git a/tests/skyserve/spot/recovery.yaml b/tests/skyserve/spot/recovery.yaml index 3eefd1bca8a..6cbd69b306f 100644 --- a/tests/skyserve/spot/recovery.yaml +++ b/tests/skyserve/spot/recovery.yaml @@ -1,4 +1,12 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 + + resources: + ports: 8080 cloud: gcp cpus: 2+ zone: us-central1-a @@ -8,10 +16,3 @@ workdir: examples/serve/http_server # Use 8080 to test jupyter service 
is terminated run: python3 server.py --port 8080 - -service: - port: 8080 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 1 diff --git a/tests/skyserve/spot/user_bug.yaml b/tests/skyserve/spot/user_bug.yaml index da1e525afc2..fe12394ea77 100644 --- a/tests/skyserve/spot/user_bug.yaml +++ b/tests/skyserve/spot/user_bug.yaml @@ -1,4 +1,12 @@ +service: + readiness_probe: + path: /health + initial_delay_seconds: 20 + replicas: 1 + + resources: + ports: 8080 cloud: gcp cpus: 2+ zone: us-central1-a @@ -7,10 +15,3 @@ resources: workdir: tests/skyserve/spot run: python3 user_bug.py - -service: - port: 8080 - readiness_probe: - path: /health - initial_delay_seconds: 20 - replicas: 1 From f2b4c29ee1805855dbb5e9f700d192c8241520f9 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 01:56:04 -0800 Subject: [PATCH 192/223] add cancel test --- tests/skyserve/cancel/cancel.yaml | 16 +++++++++ tests/skyserve/cancel/send_cancel_request.py | 38 ++++++++++++++++++++ tests/test_smoke.py | 22 ++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 tests/skyserve/cancel/cancel.yaml create mode 100644 tests/skyserve/cancel/send_cancel_request.py diff --git a/tests/skyserve/cancel/cancel.yaml b/tests/skyserve/cancel/cancel.yaml new file mode 100644 index 00000000000..1e4007878fb --- /dev/null +++ b/tests/skyserve/cancel/cancel.yaml @@ -0,0 +1,16 @@ +# This is copied from examples/serve/misc/cancel/service.yaml, but with +# cloud set to gcp. 
+ +service: + readiness_probe: + path: /health + initial_delay_seconds: 120 + +resources: + ports: 9000 + cloud: gcp + cpus: 2+ + +workdir: examples/serve/misc/cancel + +run: python3 server.py --port 9000 diff --git a/tests/skyserve/cancel/send_cancel_request.py b/tests/skyserve/cancel/send_cancel_request.py new file mode 100644 index 00000000000..48c2b2bec63 --- /dev/null +++ b/tests/skyserve/cancel/send_cancel_request.py @@ -0,0 +1,38 @@ +import argparse +import asyncio + +import aiohttp + +parser = argparse.ArgumentParser() +parser.add_argument('--endpoint', type=str, required=True) +args = parser.parse_args() + + +async def fetch(session, url): + try: + async with session.get(url) as response: + print('Got response!') + return await response.text() + except asyncio.CancelledError: + print('Request was cancelled!') + raise + + +async def main(): + timeout = 2 + + async with aiohttp.ClientSession() as session: + task = asyncio.create_task(fetch(session, f'http://{args.endpoint}/')) + + await asyncio.sleep(timeout) + # We manually cancel requests for test purposes. + # You could also manually Ctrl + C a curl to cancel a request. 
+ task.cancel() + + try: + await task + except asyncio.CancelledError: + print('Main function caught the cancelled exception.') + + +asyncio.run(main()) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 0609b831753..59bd194304d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2896,6 +2896,28 @@ def terminate_replica(replica_id: int) -> str: run_one_test(test) +@pytest.mark.gcp +@pytest.mark.sky_serve +def test_skyserve_cancel(): + """Test skyserve with cancel""" + name = _get_service_name() + + test = Test( + f'test-skyserve-cancel', + [ + f'sky serve up -n {name} -y tests/skyserve/cancel/cancel.yaml', + _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{_get_serve_endpoint(name)}; python3 ' + 'tests/skyserve/cancel/send_cancel_request.py ' + '--endpoint $endpoint | grep "Request was cancelled"', + f'sky serve logs {name} 1 --no-follow | grep "Client disconnected, stopping computation"', + ], + f'sky serve down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + + # ------- Testing user ray cluster -------- @pytest.mark.no_kubernetes # Kubernetes does not support sky status -r yet. def test_user_ray_cluster(generic_cloud: str): From b68a1fd33a89c7c91d71c2d3c3ab02df76cce1ae Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 08:43:25 -0800 Subject: [PATCH 193/223] fix controller resources cloud not specified --- sky/execution.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 6a09df145ce..c4336f59022 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1075,11 +1075,12 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: vars_to_fill, output_path=controller_file.name) controller_task = task_lib.Task.from_yaml(controller_file.name) - # Choose the same cloud if controller is not launched, controller - # resources not specify cloud and replica cloud is specified. 
- controller_cloud = (requested_resources.cloud if status is None and - controller_resources.cloud is None and - requested_resources.cloud is not None else None) + # Choose the same cloud if controller is not launched and the + # controller resources cloud is not specified. + if (status is None and controller_resources.cloud is None): + controller_cloud = requested_resources.cloud + else: + controller_cloud = controller_resources.cloud # TODO(tian): Probably run another sky.launch after we get the load # balancer port from the controller? So we don't need to open so many # ports here. Or, we should have a nginx traffic control to refuse From 81a88d25c50a492d62e359330d9858b75a3f7efd Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 12:31:20 -0800 Subject: [PATCH 194/223] ux --- sky/cli.py | 59 +++++++++++++++++++++-------- sky/execution.py | 5 ++- sky/serve/serve_utils.py | 23 ++++++----- sky/utils/cli_utils/status_utils.py | 2 +- sky/utils/controller_utils.py | 12 +++--- 5 files changed, 69 insertions(+), 32 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 734627cd34b..9c4d67e44b9 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1674,8 +1674,22 @@ def _get_spot_jobs( def _get_services(service_names: Optional[List[str]], show_all: bool, show_endpoint: bool, - is_called_by_user: bool = False) -> str: - msg = None + is_called_by_user: bool = False) -> Tuple[Optional[int], str]: + """Get service statuses. + + Args: + service_names: If not None, only show the statuses of these services. + show_all: Show all information of each service. + show_endpoint: If True, only show the endpoint of the service. + is_called_by_user: If this function is called by user directly, or an + internal call. + + Returns: + A tuple of (num_services, msg). If num_services is None, it means there + is an error when querying the services. In this case, msg contains the + error message. Otherwise, msg contains the formatted service table. 
+ """ + num_services = None try: if not is_called_by_user: usage_lib.messages.usage.set_internal() @@ -1684,16 +1698,16 @@ def _get_services(service_names: Optional[List[str]], # Change empty list to None service_names = None service_records = core.serve_status(service_names) + num_services = len(service_records) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status if controller_status == status_lib.ClusterStatus.INIT: msg = 'Controller is initializing. Please wait for a while.' else: assert controller_status in [None, status_lib.ClusterStatus.STOPPED] - msg = 'No existing services.' - if controller_status is None: - msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h' - f'{colorama.Style.RESET_ALL})') + msg = ('No existing services. (See: ' + f'{colorama.Style.BRIGHT}sky serve -h' + f'{colorama.Style.RESET_ALL})') except RuntimeError as e: msg = ('Failed to fetch service statuses due to connection issues. ' 'Please try again later. Details: ' @@ -1714,7 +1728,16 @@ def _get_services(service_names: Optional[List[str]], msg = status_utils.get_endpoint(service_records[0]) else: msg = status_utils.format_service_table(service_records, show_all) - return msg + service_not_found_msg = '' + if service_names is not None: + for service_name in service_names: + if not any(service_name == record['name'] + for record in service_records): + service_not_found_msg += ( + f'\nService {service_name!r} not found.') + if service_not_found_msg: + msg += f'\n{service_not_found_msg}' + return num_services, msg @cli.command() @@ -1894,7 +1917,6 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, cluster_name) if controller is not None: reserved_clusters.append(cluster_record) - hints.append(controller.value.sky_status_hint) else: nonreserved_cluster_records.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( @@ -1948,8 +1970,8 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: 'shown)') job_info += '. 
' hints.append( - f'* {job_info}To see all spot jobs: {colorama.Style.BRIGHT}' - f'sky spot queue{colorama.Style.RESET_ALL}') + controller_utils.Controllers.SPOT_CONTROLLER.value. + in_progress_hint.format(job_info=job_info)) if show_services: click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' @@ -1959,10 +1981,17 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: msg = 'KeyboardInterrupt' else: with rich_utils.safe_status('[cyan]Checking services[/]'): - interrupted, msg = _try_get_future_result(services_future) + interrupted, result = _try_get_future_result( + services_future) if interrupted: + num_services = -1 msg = 'KeyboardInterrupt' + else: + num_services, msg = result click.echo(msg) + if num_services is not None: + hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. + value.in_progress_hint) if show_spot_jobs or show_services: try: @@ -4346,10 +4375,10 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): """ # This won't pollute the output of --endpoint. with rich_utils.safe_status('[cyan]Checking services[/]'): - msg = _get_services(service_names, - show_all=all, - show_endpoint=endpoint, - is_called_by_user=True) + _, msg = _get_services(service_names, + show_all=all, + show_endpoint=endpoint, + is_called_by_user=True) if not endpoint: click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' diff --git a/sky/execution.py b/sky/execution.py index c4336f59022..b8aff024fbc 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1107,6 +1107,8 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: ) # TODO(tian): Use this to check the endpoint and if the # service name is duplicated. + # TODO(tian): Cache endpoint locally to speedup. Endpoint won't + # change after the first time, so there is no consistency issue. 
sn = service_name style = colorama.Style fore = colorama.Fore @@ -1136,7 +1138,8 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: '\nTo send a test request:\t\t' f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' f'--endpoint){backend_utils.RESET_BOLD}' - f'\n{fore.GREEN}SkyServe is bootstrapping your service now.' + '\n' + f'\n{fore.GREEN}SkyServe is spinning up your service now.' f'{style.RESET_ALL}' f'\n{fore.GREEN}The endpoint and replicas should be ready ' f'within a short time.{style.RESET_ALL}') diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index e5ddb1eb6ba..cd167f9080a 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -262,8 +262,9 @@ def load_add_service_result(payload: str) -> bool: return common_utils.decode_payload(payload) -def get_serve_status(service_name: str, - with_replica_info: bool = True) -> Dict[str, Any]: +def _get_serve_status( + service_name: str, + with_replica_info: bool = True) -> Optional[Dict[str, Any]]: """Get the status dict of the service. Args: @@ -271,11 +272,12 @@ def get_serve_status(service_name: str, with_replica_info: Whether to include the information of all replicas. Returns: - A dictionary, describing the status of the service. + A dictionary describing the status of the service if the service exists. + Otherwise, return None. 
""" record = serve_state.get_service_from_name(service_name) if record is None: - raise ValueError(f'Service {service_name!r} does not exist.') + return None if with_replica_info: record['replica_info'] = [ info.to_info_dict(with_handle=True) @@ -290,7 +292,9 @@ def get_serve_status_encoded(service_names: Optional[List[str]]) -> str: # Get all service names service_names = serve_state.get_glob_service_names(None) for service_name in service_names: - serve_status = get_serve_status(service_name) + serve_status = _get_serve_status(service_name) + if serve_status is None: + continue serve_statuses.append({ k: base64.b64encode(pickle.dumps(v)).decode('utf-8') for k, v in serve_status.items() @@ -313,7 +317,8 @@ def terminate_services(service_names: Optional[List[str]]) -> str: service_names = serve_state.get_glob_service_names(service_names) terminated_service_names = [] for service_name in service_names: - serve_status = get_serve_status(service_name, with_replica_info=False) + serve_status = _get_serve_status(service_name, with_replica_info=False) + assert serve_status is not None if (serve_status['status'] in serve_state.ServiceStatus.refuse_to_terminate_statuses()): # TODO(tian): Cleanup replicas for CONTROLLER_FAILED status. Seems @@ -330,13 +335,13 @@ def terminate_services(service_names: Optional[List[str]]) -> str: # to the file? It will be helpful for update cases. f.write(UserSignal.TERMINATE.value) f.flush() - terminated_service_names.append(service_name) + terminated_service_names.append(f'{service_name!r}') if len(terminated_service_names) == 0: return 'No service to terminate.' 
- identity_str = f'Service with name {terminated_service_names[0]} is' + identity_str = f'Service {terminated_service_names[0]} is' if len(terminated_service_names) > 1: terminated_service_names_str = ', '.join(terminated_service_names) - identity_str = f'Services with names {terminated_service_names_str} are' + identity_str = f'Services {terminated_service_names_str} are' return f'{identity_str} scheduled to be terminated.' diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index f6d0b57ce33..cbf7d960f53 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -150,7 +150,7 @@ def format_service_table(service_records: List[_ServiceRecord], replica_table = format_replica_table(replica_infos, show_all) return (f'{service_table}\n' f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Replicas{colorama.Style.RESET_ALL}\n' + f'Service Replicas{colorama.Style.RESET_ALL}\n' f'{replica_table}') diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 1f902b57172..943bce55630 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -33,7 +33,7 @@ class _ControllerSpec: """Spec for skypilot controllers.""" name: str cluster_name: str - sky_status_hint: str + in_progress_hint: str decline_cancel_hint: str decline_down_in_init_status_hint: str decline_down_for_dirty_controller_hint: str @@ -48,9 +48,9 @@ class Controllers(enum.Enum): SPOT_CONTROLLER = _ControllerSpec( name='managed spot controller', cluster_name=spot.SPOT_CONTROLLER_NAME, - sky_status_hint=( - f'* To see detailed spot job status: {colorama.Style.BRIGHT}' - f'sky spot queue{colorama.Style.RESET_ALL}'), + in_progress_hint=( + '* {job_info}To see all spot jobs: ' + f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), decline_cancel_hint=( 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel Date: Fri, 10 
Nov 2023 13:18:31 -0800 Subject: [PATCH 195/223] ux for down --- sky/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 9c4d67e44b9..b358491a2b1 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4426,9 +4426,8 @@ def serve_down(service_names: List[str], all: bool, yes: bool): sky serve down -a """ - service_names_str = ','.join(service_names) if sum([len(service_names) > 0, all]) != 1: - argument_str = f'SERVICE_NAMES={service_names_str}' if len( + argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( service_names) > 0 else '' argument_str += ' --all' if all else '' raise click.UsageError( @@ -4443,7 +4442,8 @@ def serve_down(service_names: List[str], all: bool, yes: bool): sys.exit(1) if not yes: - service_identity_str = f'services with name {service_names_str}' + quoted_service_names = [f'{name!r}' for name in service_names] + service_identity_str = f'service(s) {", ".join(quoted_service_names)}' if all: service_identity_str = 'all services' click.confirm(f'Terminating {service_identity_str}. Proceed?', From f68242cc2e6541d410f427b17059b17e18e19ba7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 13:31:46 -0800 Subject: [PATCH 196/223] add smoke retry teardown --- tests/test_smoke.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 59bd194304d..1009e398b11 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2644,6 +2644,17 @@ def _get_service_name() -> str: _IP_REGEX = r'([0-9]{1,3}\.){3}[0-9]{1,3}' _ENDPOINT_REGEX = _IP_REGEX + r':[0-9]{1,5}' _AWK_ALL_LINES_BELOW_REPLICAS = r'/Replicas/{flag=1; next} flag' +# Since we don't allow terminate the service if the controller is INIT, +# which is common for simultaneous pytest, we need to wait until the +# controller is UP before we can terminate the service. 
+# The teardown command has a 10-mins timeout, so we don't need to do +# the timeout here. See implementation of run_one_test() for details. +_TEARDOWN_SERVICE = ( + '(while true; do' + ' output=$(sky serve down -y {name});' + ' echo "$output" | grep -q "scheduled to be terminated" && break;' + ' sleep 10;' + 'done)') def _get_serve_endpoint(name: str) -> str: @@ -2669,7 +2680,7 @@ def _get_skyserve_http_test(name: str, cloud: str, _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=timeout_minutes * 60, ) return test @@ -2728,7 +2739,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: for prompt, output in prompt2output.items() ], ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) @@ -2759,7 +2770,7 @@ def terminate_replica(replica_id: int) -> str: _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) @@ -2784,7 +2795,7 @@ def test_skyserve_spot_user_bug(): ' sleep 10;' f'done)', ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) @@ -2840,7 +2851,7 @@ def terminate_replica(replica_id: int) -> str: 'python tests/skyserve/replica_failure/test_round_robin.py ' '--endpoint $endpoint --replica-num 1 --replica-ips $ip3', ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) @@ -2890,7 +2901,7 @@ def terminate_replica(replica_id: int) -> str: f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', f'{_get_serve_endpoint(name)}; curl -L http://$endpoint | grep "Hi, SkyPilot here"', ], - f'sky serve 
down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) @@ -2912,7 +2923,7 @@ def test_skyserve_cancel(): '--endpoint $endpoint | grep "Request was cancelled"', f'sky serve logs {name} 1 --no-follow | grep "Client disconnected, stopping computation"', ], - f'sky serve down -y {name}', + _TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, ) run_one_test(test) From 122de45e5c595e26cf01d4e52575e101fb0432fa Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 13:36:53 -0800 Subject: [PATCH 197/223] resolve conflict --- sky/execution.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 20cc505edd8..2d76e70fe7b 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -8,11 +8,7 @@ import os import re import tempfile -<<<<<<< HEAD from typing import Any, Dict, List, Optional, Tuple, Union -======= -from typing import Any, List, Optional, Tuple, Union ->>>>>>> origin/master import uuid import colorama @@ -185,12 +181,8 @@ def _execute( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, -<<<<<<< HEAD _is_launched_by_sky_serve_controller: bool = False, -) -> None: -======= ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: ->>>>>>> origin/master """Execute an entrypoint. 
If sky.Task is given or DAG has not been optimized yet, this will call From 33bd321ac0802c95561e56c5f1d9ea14b6fdb960 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 15:51:56 -0800 Subject: [PATCH 198/223] add endpoint, move check for name conflict to _execute --- sky/cli.py | 7 +- sky/execution.py | 184 +++++++++++++++------------------- sky/serve/__init__.py | 4 +- sky/serve/constants.py | 3 + sky/serve/replica_managers.py | 4 +- sky/serve/serve_state.py | 36 +++---- sky/serve/serve_utils.py | 61 +++++++---- sky/serve/service.py | 70 ++++++++----- 8 files changed, 195 insertions(+), 174 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 22dd2a6cd2d..b803fd65b0d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1702,9 +1702,10 @@ def _get_services(service_names: Optional[List[str]], msg = 'Controller is initializing. Please wait for a while.' else: assert controller_status in [None, status_lib.ClusterStatus.STOPPED] - msg = ('No existing services. (See: ' - f'{colorama.Style.BRIGHT}sky serve -h' - f'{colorama.Style.RESET_ALL})') + msg = 'No existing services. ' + if controller_status is None: + msg += (f'(See: {colorama.Style.BRIGHT}sky serve -h' + f'{colorama.Style.RESET_ALL})') except RuntimeError as e: msg = ('Failed to fetch service statuses due to connection issues. ' 'Please try again later. Details: ' diff --git a/sky/execution.py b/sky/execution.py index 2d76e70fe7b..0e5e8f54bd6 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -12,7 +12,6 @@ import uuid import colorama -import filelock import sky from sky import backends @@ -72,8 +71,6 @@ '{controller_type}.controller.resources is a valid resources spec. ' 'Details:\n {err}') -_SERVE_UP_NAME_LOCK_PATH = '/tmp/sky_serve_up_{}.lock' - def _convert_to_dag(entrypoint: Any) -> 'sky.Dag': """Convert the entrypoint to a sky.Dag. 
@@ -1010,27 +1007,33 @@ def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, storage_obj.force_delete = True -def _register_service_name(service_name: str, - handle: backends.CloudVmRayResourceHandle) -> bool: - """Register a service name on the controller if it is running. +@usage_lib.entrypoint +def serve_up( + task: 'sky.Task', + service_name: Optional[str] = None, +) -> None: + """Spin up a service. + + Please refer to the sky.cli.serve_up for the document. - Returns: - True if the service name is not occupied, False otherwise. + Args: + task: sky.Task to serve up. + service_name: Name of the service. """ - code = serve.ServeCodeGen.add_service_if_not_exist(service_name) - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - returncode, stdout, _ = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False) - subprocess_utils.handle_returncode( - returncode, code, 'Failed to register service name on controller', - stdout) - return serve.load_add_service_result(stdout) - - -def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: + if service_name is None: + service_name = serve.generate_service_name() + + # The service name will be used as: + # 1. controller cluster name: 'sky-serve-controller-' + # 2. replica cluster name: '-' + # In both cases, service name shares the same regex with cluster name. 
+ if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service name {service_name!r} is invalid: ' + f'ensure it is fully matched by regex (e.g., ' + 'only contains lower letters, numbers and dash): ' + f'{constants.CLUSTER_NAME_VALID_REGEX}') + if task.service is None: with ux_utils.print_exception_no_traceback(): raise RuntimeError('Service section not found.') @@ -1043,33 +1046,6 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: 'Must only specify one port in resources. Each replica ' 'will use the port specified as application ingress port.') - with rich_utils.safe_status( - '[cyan]Registering service on the controller[/]'): - with sky_logging.silent(): - status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers. - SKY_SERVE_CONTROLLER, - stopped_message='') - if handle is None or handle.head_ip is None: - # The sky serve controller is STOPPED, or it is the first time - # provisioning either after an AUTOSTOP, or the first time the - # controller is created, which means there is no service on the - # controller. We will create the service database record in - # sky.serve.service._start once the controller is running. - logger.info('The sky serve controller is not running. ' - 'Will register the service once the controller is up.') - else: - # The sky serve controller is UP, check if the service exists. - success = _register_service_name(service_name, handle) - if not success: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'The service {service_name!r} is already running. ' - 'Please specify a different name for your service. 
' - 'To update an existing service, run: `sky serve down` ' - 'and then `sky serve up` again (in-place update will ' - 'be supported in the future).') - _maybe_translate_local_file_mounts_and_sync_up(task, path='serve') with tempfile.NamedTemporaryFile( @@ -1082,8 +1058,8 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: controller_name = serve.SKY_SERVE_CONTROLLER_NAME task_config = task.to_yaml_config() common_utils.dump_yaml(service_file.name, task_config) - remote_task_yaml_path = ( - serve.generate_remote_task_yaml_file_name(service_name)) + remote_tmp_task_yaml_path = ( + serve.generate_remote_tmp_task_yaml_file_name(service_name)) remote_config_yaml_path = ( serve.generate_remote_config_yaml_file_name(service_name)) controller_log_file = ( @@ -1093,7 +1069,7 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: controller_resources_config=serve.CONTROLLER_RESOURCES, remote_user_config_path=remote_config_yaml_path) vars_to_fill = { - 'remote_task_yaml_path': remote_task_yaml_path, + 'remote_task_yaml_path': remote_tmp_task_yaml_path, 'local_task_yaml_path': service_file.name, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, @@ -1105,12 +1081,9 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: vars_to_fill, output_path=controller_file.name) controller_task = task_lib.Task.from_yaml(controller_file.name) - # Choose the same cloud if controller is not launched and the - # controller resources cloud is not specified. - if (status is None and controller_resources.cloud is None): - controller_cloud = requested_resources.cloud - else: - controller_cloud = controller_resources.cloud + controller_cloud = (requested_resources.cloud + if controller_resources.cloud is None else + controller_resources.cloud) # TODO(tian): Probably run another sky.launch after we get the load # balancer port from the controller? So we don't need to open so many # ports here. 
Or, we should have a nginx traffic control to refuse @@ -1126,7 +1099,17 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: print(f'{colorama.Fore.YELLOW}Launching controller for ' f'{service_name!r}...{colorama.Style.RESET_ALL}') - _execute( + # We directly submit the request to the controller and let the + # controller to check name conflict. Suppose we have multiple + # sky.serve_up() with same service name, the first one will + # successfully write its job id to controller service database; + # and for all following sky.serve_up, the controller will throw + # an exception (name conflict detected) and exit. Therefore the + # controller job id in database could be use as an indicator of + # whether the service is already running. If the id is the same + # with the current job id, we know the service is up and running + # for the first time; otherwise it is a name conflict. + controller_job_id, controller_handle = _execute( entrypoint=controller_task, stream_logs=False, cluster_name=controller_name, @@ -1135,19 +1118,51 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) - # TODO(tian): Use this to check the endpoint and if the - # service name is duplicated. + + assert controller_job_id is not None and controller_handle is not None # TODO(tian): Cache endpoint locally to speedup. Endpoint won't # change after the first time, so there is no consistency issue. + with rich_utils.safe_status( + '[cyan]Waiting for the service to initialize[/]'): + # This function will check the controller job id in the database + # and return the endpoint if the job id matches. Otherwise it will + # return None. 
+ code = serve.ServeCodeGen.wait_service_initialization( + service_name, controller_job_id) + backend = backend_utils.get_backend_from_handle(controller_handle) + assert isinstance(backend, backends.CloudVmRayBackend) + assert isinstance(controller_handle, + backends.CloudVmRayResourceHandle) + returncode, lb_port_payload, _ = backend.run_on_head( + controller_handle, + code, + require_outputs=True, + stream_logs=False) + subprocess_utils.handle_returncode( + returncode, code, 'Failed to wait for service initialization', + lb_port_payload) + lb_port = serve.load_service_initialization_result(lb_port_payload) + if lb_port is None: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'The service {service_name!r} is already running. ' + 'Please specify a different name for your service. ' + 'To update an existing service, run: `sky serve down` ' + 'and then `sky serve up` again (in-place update will ' + 'be supported in the future).') + endpoint = f'{controller_handle.head_ip}:{lb_port}' + sn = service_name style = colorama.Style fore = colorama.Fore sky_logging.print( f'{fore.CYAN}Service name: ' f'{style.BRIGHT}{sn}{style.RESET_ALL}' + f'\n{fore.CYAN}Endpoint URL: ' + f'{style.BRIGHT}{endpoint}{style.RESET_ALL}' '\nTo see detailed info:\t\t' - f'{backend_utils.BOLD}sky serve status {sn} (-a)' - f'{backend_utils.RESET_BOLD}' + f'{backend_utils.BOLD}sky serve status {sn} ' + f'[--endpoint]{backend_utils.RESET_BOLD}' '\nTo teardown the service:\t' f'{backend_utils.BOLD}sky serve down {sn}' f'{backend_utils.RESET_BOLD}' @@ -1166,45 +1181,10 @@ def _serve_up_no_lock(task: 'sky.Task', service_name: str) -> None: f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' f'{backend_utils.RESET_BOLD}' '\nTo send a test request:\t\t' - f'{backend_utils.BOLD}curl -L $(sky serve status {sn} ' - f'--endpoint){backend_utils.RESET_BOLD}' + f'{backend_utils.BOLD}curl -L {endpoint}' + f'{backend_utils.RESET_BOLD}' '\n' f'\n{fore.GREEN}SkyServe is spinning up your 
service now.' f'{style.RESET_ALL}' - f'\n{fore.GREEN}The endpoint and replicas should be ready ' - f'within a short time.{style.RESET_ALL}') - - -@usage_lib.entrypoint -def serve_up( - task: 'sky.Task', - service_name: Optional[str] = None, -) -> None: - """Spin up a service. - - Please refer to the sky.cli.serve_up for the document. - - Args: - task: sky.Task to serve up. - service_name: Name of the service. - """ - if service_name is None: - service_name = serve.generate_service_name() - - # The service name will be used as: - # 1. controller cluster name: 'sky-serve-controller-' - # 2. replica cluster name: '-' - # In both cases, service name shares the same regex with cluster name. - if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service name {service_name!r} is invalid: ' - f'ensure it is fully matched by regex (e.g., ' - 'only contains lower letters, numbers and dash): ' - f'{constants.CLUSTER_NAME_VALID_REGEX}') - - # We need this lock to make sure no two sky.serve_up() with same service - # name are running at the same time. It is for the race condition that - # two of them are trying to create a record in controller services database - # but the controller is not up yet. 
- with filelock.FileLock(_SERVE_UP_NAME_LOCK_PATH.format(service_name)): - _serve_up_no_lock(task, service_name) + f'\n{fore.GREEN}The replicas should be ready within a ' + f'short time.{style.RESET_ALL}') diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index a9ce1e82fbe..e25d24a079e 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -12,11 +12,11 @@ from sky.serve.serve_state import ServiceStatus from sky.serve.serve_utils import generate_remote_config_yaml_file_name from sky.serve.serve_utils import generate_remote_controller_log_file_name -from sky.serve.serve_utils import generate_remote_task_yaml_file_name +from sky.serve.serve_utils import generate_remote_tmp_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import generate_service_name -from sky.serve.serve_utils import load_add_service_result from sky.serve.serve_utils import load_serve_status +from sky.serve.serve_utils import load_service_initialization_result from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME diff --git a/sky/serve/constants.py b/sky/serve/constants.py index ecb3ff852a5..d99d7a95227 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -12,6 +12,9 @@ # Signal file path for controller to handle signals. SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}' +# Time to wait in seconds for service to initialize. +INITIALIZATION_TIMEOUT_SECONDS = 60 + # The time interval in seconds for load balancer to sync with controller. 
Every # time the load balancer syncs with controller, it will update all available # replica ips for each service, also send the number of requests in last query diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index d6006e534a3..ef586cc4d00 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -576,7 +576,7 @@ def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: self._terminate_replica(replica_id, sync_down_logs=False) - def _recover_from_preemption(self, replica_id: int) -> None: + def _handle_preemption(self, replica_id: int) -> None: logger.info(f'Beginning recovery for preempted replica {replica_id}.') # TODO(MaoZiming): Support spot recovery policies info = serve_state.get_replica_info_from_id(self._service_name, @@ -824,7 +824,7 @@ def _probe_all_replicas(self) -> None: f' (status: {cluster_status.value})') logger.info(f'Replica {info.replica_id} ' f'is preempted{cluster_status_str}.') - self._recover_from_preemption(info.replica_id) + self._handle_preemption(info.replica_id) continue diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index 376d90c0e39..aad8f764168 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -181,14 +181,24 @@ def from_replica_statuses( } -# === Service functions === -def add_service_if_not_exist(name: str) -> bool: - """Adds a service to the database.""" +def add_service(name: str, controller_job_id: int, policy: str, + auto_restart: bool, requested_resources: 'sky.Resources', + status: ServiceStatus) -> bool: + """Add a service in the database. + + Returns: + True if the service is added successfully, False if the service already + exists. 
+ """ try: _DB.cursor.execute( """\ - INSERT INTO services (name, status) - VALUES (?, ?)""", (name, ServiceStatus.CONTROLLER_INIT.value)) + INSERT INTO services + (name, controller_job_id, status, policy, + auto_restart, requested_resources) + VALUES (?, ?, ?, ?, ?, ?)""", + (name, controller_job_id, status.value, policy, int(auto_restart), + pickle.dumps(requested_resources))) _DB.conn.commit() except sqlite3.IntegrityError as e: if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG: @@ -197,22 +207,6 @@ def add_service_if_not_exist(name: str) -> bool: return True -def add_or_update_service(name: str, controller_job_id: int, policy: str, - auto_restart: bool, - requested_resources: 'sky.Resources', - status: ServiceStatus) -> None: - """Updates a service in the database.""" - _DB.cursor.execute( - """\ - INSERT OR REPLACE INTO services - (name, controller_job_id, status, policy, - auto_restart, requested_resources) - VALUES (?, ?, ?, ?, ?, ?)""", - (name, controller_job_id, status.value, policy, int(auto_restart), - pickle.dumps(requested_resources))) - _DB.conn.commit() - - def remove_service(service_name: str) -> None: """Removes a service from the database.""" _DB.cursor.execute("""\ diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index cd167f9080a..532db48e0ba 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -166,9 +166,15 @@ def generate_remote_service_dir_name(service_name: str) -> str: return os.path.join(constants.SKYSERVE_METADATA_DIR, service_name) -def generate_remote_task_yaml_file_name(service_name: str) -> str: +def generate_remote_tmp_task_yaml_file_name(service_name: str) -> str: dir_name = generate_remote_service_dir_name(service_name) # Don't expand here since it is used for remote machine. 
+ return os.path.join(dir_name, 'task.yaml.tmp') + + +def generate_task_yaml_file_name(service_name: str) -> str: + dir_name = generate_remote_service_dir_name(service_name) + dir_name = os.path.expanduser(dir_name) return os.path.join(dir_name, 'task.yaml') @@ -253,15 +259,6 @@ def update_service_status() -> None: record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED) -def add_service_if_not_exist(service_name: str) -> str: - return common_utils.encode_payload( - serve_state.add_service_if_not_exist(service_name)) - - -def load_add_service_result(payload: str) -> bool: - return common_utils.decode_payload(payload) - - def _get_serve_status( service_name: str, with_replica_info: bool = True) -> Optional[Dict[str, Any]]: @@ -345,6 +342,36 @@ def terminate_services(service_names: Optional[List[str]]) -> str: return f'{identity_str} scheduled to be terminated.' +def wait_service_initialization(service_name: str, job_id: int) -> str: + """Util function to call at the end of `sky.serve_up()`. + + This function will: + (1) Check the name duplication by job id of the controller. If + the job id is not the same as the database record, this + means another service is already taken that name. See + sky/execution.py::serve_up for more details. + (2) Wait for the load balancer port to be assigned and return. 
+ """ + cnt = 0 + while True: + record = serve_state.get_service_from_name(service_name) + if record is None: + continue + if job_id != record['controller_job_id']: + return common_utils.encode_payload(None) + lb_port = record['load_balancer_port'] + if lb_port is not None: + return common_utils.encode_payload(lb_port) + time.sleep(1) + cnt += 1 + if cnt > constants.INITIALIZATION_TIMEOUT_SECONDS: + raise ValueError(f'Failed to initialize service {service_name!r}.') + + +def load_service_initialization_result(payload: str) -> Optional[int]: + return common_utils.decode_payload(payload) + + def check_service_status_healthy(service_name: str) -> Optional[str]: service_record = serve_state.get_service_from_name(service_name) if service_record is None: @@ -542,26 +569,26 @@ class ServeCodeGen: ] @classmethod - def add_service_if_not_exist(cls, service_name: str) -> str: + def get_serve_status(cls, service_names: Optional[List[str]]) -> str: code = [ - f'msg = serve_utils.add_service_if_not_exist({service_name!r})', + f'msg = serve_utils.get_serve_status_encoded({service_names!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def get_serve_status(cls, service_names: Optional[List[str]]) -> str: + def terminate_services(cls, service_names: Optional[List[str]]) -> str: code = [ - f'msg = serve_utils.get_serve_status_encoded({service_names!r})', + f'msg = serve_utils.terminate_services({service_names!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) @classmethod - def terminate_services(cls, service_names: Optional[List[str]]) -> str: + def wait_service_initialization(cls, service_name: str, job_id: int) -> str: code = [ - f'msg = serve_utils.terminate_services({service_names!r})', - 'print(msg, end="", flush=True)' + 'msg = serve_utils.wait_service_initialization(' + f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)' ] return cls._build(code) diff --git a/sky/serve/service.py b/sky/serve/service.py index 
8e605e57f30..b67493b16b9 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -63,6 +63,20 @@ def _handle_signal(service_name: str) -> None: raise error_type(f'User signal received: {user_signal.value}') +def _cleanup_storage(task_yaml: str) -> bool: + try: + task = task_lib.Task.from_yaml(task_yaml) + backend = cloud_vm_ray_backend.CloudVmRayBackend() + backend.teardown_ephemeral_storage(task) + except Exception as e: # pylint: disable=broad-except + logger.error('Failed to clean up storage: ' + f'{common_utils.format_exception(e)}') + with ux_utils.enable_traceback(): + logger.error(f' Traceback: {traceback.format_exc()}') + return True + return False + + def _cleanup(service_name: str, task_yaml: str) -> bool: """Clean up the sky serve replicas, storage, and service record.""" failed = False @@ -94,37 +108,19 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: info) failed = True logger.error(f'Replica {info.replica_id} failed to terminate.') - try: - task = task_lib.Task.from_yaml(task_yaml) - backend = cloud_vm_ray_backend.CloudVmRayBackend() - backend.teardown_ephemeral_storage(task) - except Exception as e: # pylint: disable=broad-except - logger.error('Failed to clean up storage: ' - f'{common_utils.format_exception(e)}') - with ux_utils.enable_traceback(): - logger.error(f' Traceback: {traceback.format_exc()}') - failed = True + failed = failed or _cleanup_storage(task_yaml) return failed -def _start(service_name: str, task_yaml: str, job_id: int): +def _start(service_name: str, tmp_task_yaml: str, job_id: int): """Starts the service.""" - # Generate log file name. - load_balancer_log_file = os.path.expanduser( - serve_utils.generate_remote_load_balancer_log_file_name(service_name)) - - # Create the service working directory. 
- service_dir = os.path.expanduser( - serve_utils.generate_remote_service_dir_name(service_name)) - os.makedirs(service_dir, exist_ok=True) - # Generate ssh key pair to avoid race condition when multiple sky.launch # are executed at the same time. authentication.get_or_generate_keys() # Initialize database record for the service. - service_spec = serve.SkyServiceSpec.from_yaml(task_yaml) - with open(task_yaml, 'r') as f: + service_spec = serve.SkyServiceSpec.from_yaml(tmp_task_yaml) + with open(tmp_task_yaml, 'r') as f: config = yaml.safe_load(f) resources_config = None if isinstance(config, dict): @@ -135,16 +131,36 @@ def _start(service_name: str, task_yaml: str, job_id: int): # TODO(tian): Probably we should raise an error and not pending here. # This busy loop is also a ray job and will take a lot of memory. status = serve_state.ServiceStatus.PENDING - # Here, the service record might already registered in the database if the - # controller is UP, but also might not if the controller is STOPPED or not - # created yet before this service. So we use add_or_update_service here. - # See sky.execution._register_service_name for more details. - serve_state.add_or_update_service(service_name, + success = serve_state.add_service(service_name, controller_job_id=job_id, policy=service_spec.policy_str(), auto_restart=service_spec.auto_restart, requested_resources=requested_resources, status=status) + # Directly throw an error here. See sky/execution.py::serve_up + # for more details. + if not success: + _cleanup_storage(tmp_task_yaml) + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service {service_name} already exists.') + + # Create the service working directory. + service_dir = os.path.expanduser( + serve_utils.generate_remote_service_dir_name(service_name)) + os.makedirs(service_dir, exist_ok=True) + + # Copy the tmp task yaml file to the final task yaml file. + # This is for the service name conflict case. 
The _execute will + # sync file mounts first and then realized a name conflict. We + # don't want the new file mounts to overwrite the old one, so we + # sync to a tmp file first and then copy it to the final name + # if there is no name conflict. + task_yaml = serve_utils.generate_task_yaml_file_name(service_name) + shutil.copy(tmp_task_yaml, task_yaml) + + # Generate load balancer log file name. + load_balancer_log_file = os.path.expanduser( + serve_utils.generate_remote_load_balancer_log_file_name(service_name)) controller_process = None load_balancer_process = None From 4ff9975161017c42aefd9a6cd17301d0b6f06894 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 10 Nov 2023 21:46:44 -0800 Subject: [PATCH 199/223] apply suggestion from code review --- sky/serve/controller.py | 2 ++ sky/serve/replica_managers.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/sky/serve/controller.py b/sky/serve/controller.py index 95bb97105d9..4f08c054731 100644 --- a/sky/serve/controller.py +++ b/sky/serve/controller.py @@ -113,6 +113,8 @@ def configure_logger(): uvicorn.run(self._app, host='localhost', port=self._port) +# TODO(tian): Probably we should support service that will stop the VM in +# specific time period. def run_controller(service_name: str, service_spec: serve.SkyServiceSpec, task_yaml: str, controller_port: int): controller = SkyServeController(service_name, service_spec, task_yaml, diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index ef586cc4d00..11351cbdc72 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -235,7 +235,7 @@ def is_scale_down_succeeded(self, initial_delay_seconds: int, return False return self.first_ready_time is not None - def should_track_status(self) -> bool: + def should_track_service_status(self) -> bool: """Should we track the status of the replica. This includes: @@ -381,6 +381,9 @@ def probe( Tuple of (self, is_ready, probe_time). 
""" replica_identity = f'replica {self.replica_id} with url {self.url}' + # # TODO(tian): This requiring the clock on each replica to be aligned, + # which may not be true when the GCP VMs have run for a long time. We + # should have a better way to do this. See #2539 for more information. probe_time = time.time() try: msg = '' @@ -459,6 +462,12 @@ def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec', task_yaml_path: str) -> None: super().__init__(service_name, spec) self._task_yaml_path = task_yaml_path + # TODO(tian): Store launch/down pid in the replica table, to make the + # manager more persistent. Current blocker is that we need to manually + # poll the Process (by join or is_alive), otherwise, it will never + # finish and become a zombie process. Probably we could use + # psutil.Process(p.pid).status() == psutil.STATUS_ZOMBIE to check + # such cases. self._launch_process_pool: serve_utils.ThreadSafeDict[ int, multiprocessing.Process] = serve_utils.ThreadSafeDict() self._down_process_pool: serve_utils.ThreadSafeDict[ @@ -714,7 +723,7 @@ def _fetch_job_status(self) -> None: """ infos = serve_state.get_replica_infos(self._service_name) for info in infos: - if not info.status_property.should_track_status(): + if not info.status_property.should_track_service_status(): continue # We use backend API to avoid usage collection in the # core.job_status. 
@@ -767,7 +776,7 @@ def _probe_all_replicas(self) -> None: with mp_pool.ThreadPool() as pool: infos = serve_state.get_replica_infos(self._service_name) for info in infos: - if not info.status_property.should_track_status(): + if not info.status_property.should_track_service_status(): continue replica_to_probe.append( f'replica_{info.replica_id}(url={info.url})') From 7cff5432195b591b89e6f04d5da38522e6fc9603 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 00:27:37 -0800 Subject: [PATCH 200/223] fix jinja2 var --- sky/execution.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/execution.py b/sky/execution.py index 0e5e8f54bd6..0e58b72ee94 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -718,6 +718,10 @@ def _controller_skypilot_config_setup( if custom_controller_resources_config is not None: controller_resources_config_copied.update( custom_controller_resources_config) + else: + # If the user config is not loaded, manually set this to None + # so that the template won't render this. 
+ vars_to_fill['user_config_path'] = None try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config_copied) @@ -791,7 +795,6 @@ def spot_launch( vars_to_fill = { 'remote_user_yaml_prefix': spot.SPOT_TASK_YAML_PREFIX, 'user_yaml_path': f.name, - 'user_config_path': None, 'spot_controller': controller_name, # Note: actual spot cluster name will be - 'dag_name': dag.name, From aff82507366e341e624bc906e240e569a2843faa Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 01:36:52 -0800 Subject: [PATCH 201/223] rename reserved cluster, move controller_utils function back --- sky/backends/backend_utils.py | 193 ++++++++++++++++++++++++ sky/backends/cloud_vm_ray_backend.py | 7 +- sky/backends/onprem_utils.py | 4 +- sky/cli.py | 119 +++++++-------- sky/core.py | 50 +++---- sky/data/storage_utils.py | 47 +++++- sky/execution.py | 13 +- sky/serve/serve_utils.py | 15 +- sky/serve/service.py | 3 +- sky/utils/cli_utils/status_utils.py | 51 +------ sky/utils/controller_utils.py | 211 +-------------------------- sky/utils/schemas.py | 48 +++--- tests/test_jobs.py | 2 +- tests/test_spot.py | 11 +- 14 files changed, 376 insertions(+), 398 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index e781df24ca8..03b17590b15 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,4 +1,5 @@ """Util constants/functions for the backends.""" +import dataclasses from datetime import datetime import enum import getpass @@ -38,6 +39,7 @@ from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config +from sky import spot as spot_lib from sky import status_lib from sky.backends import onprem_utils from sky.provision import instance_setup @@ -151,6 +153,92 @@ ] +# TODO(tian): Refactor to controller_utils. Current blocker: circular import. 
+@dataclasses.dataclass +class _ControllerSpec: + """Spec for skypilot controllers.""" + name: str + cluster_name: str + in_progress_hint: str + decline_cancel_hint: str + decline_down_in_init_status_hint: str + decline_down_for_dirty_controller_hint: str + check_cluster_name_hint: str + default_hint_if_non_existent: str + + +class Controllers(enum.Enum): + """Skypilot controllers.""" + # NOTE(dev): Keep this align with + # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE + SPOT_CONTROLLER = _ControllerSpec( + name='managed spot controller', + cluster_name=spot_lib.SPOT_CONTROLLER_NAME, + in_progress_hint=( + '* {job_info}To see all spot jobs: ' + f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' + f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the spot controller while ' + 'it is in INIT state is not supported (this means a spot launch ' + 'is in progress or the previous launch failed), as we cannot ' + 'guarantee that all the spot jobs are finished. Please wait ' + 'until the spot controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' + f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' + f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), + check_cluster_name_hint=( + f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' + 'managed spot controller. 
'), + default_hint_if_non_existent='No managed spot jobs are found.') + SKY_SERVE_CONTROLLER = _ControllerSpec( + name='sky serve controller', + cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, + in_progress_hint=( + f'* To see detailed service status: {colorama.Style.BRIGHT}' + f'sky serve status -a{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the sky serve controller\'s jobs is not allowed.'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + 'while it is in INIT state is not supported (this means a sky ' + 'serve up is in progress or the previous launch failed), as we ' + 'cannot guarantee that all the services are terminated. Please ' + 'wait until the sky serve controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' + f'{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' + 'supported, as it is currently serving the following services: ' + '{service_names}. Please terminate the services first with ' + f'{colorama.Style.BRIGHT}sky serve down -a' + f'{colorama.Style.RESET_ALL}.'), + check_cluster_name_hint=( + f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' + 'sky serve controller. '), + default_hint_if_non_existent='No service is found.') + + @classmethod + def from_name(cls, name: Optional[str]) -> Optional['Controllers']: + """Check if the cluster name is a controller name. + + Returns: + The controller if the cluster name is a controller name. + Otherwise, returns None. + """ + for controller in cls: + if controller.value.cluster_name == name: + return controller + return None + + def is_ip(s: str) -> bool: """Returns whether this string matches IP_ADDR_REGEX.""" return len(re.findall(IP_ADDR_REGEX, s)) == 1 @@ -2499,6 +2587,79 @@ def check_cluster_available( return handle +# TODO(tian): Refactor to controller_utils. 
Current blocker: circular import. +def is_controller_up( + controller_type: Controllers, + stopped_message: str, + non_existent_message: Optional[str] = None, +) -> Tuple[Optional[status_lib.ClusterStatus], + Optional['backends.CloudVmRayResourceHandle']]: + """Check if the spot/serve controller is up. + + It can be used to check the actual controller status (since the autostop is + set for the controller) before the spot/serve commands interact with the + controller. + + Args: + type: Type of the controller. + stopped_message: Message to print if the controller is STOPPED. + non_existent_message: Message to show if the controller does not exist. + + Returns: + controller_status: The status of the controller. If it fails during + refreshing the status, it will be the cached status. None if the + controller does not exist. + handle: The ResourceHandle of the controller. None if the + controller is not UP or does not exist. + + Raises: + exceptions.ClusterOwnerIdentityMismatchError: if the current user is not + the same as the user who created the cluster. + exceptions.CloudUserIdentityError: if we fail to get the current user + identity. + """ + if non_existent_message is None: + non_existent_message = ( + controller_type.value.default_hint_if_non_existent) + cluster_name = controller_type.value.cluster_name + controller_name = controller_type.value.name.replace(' controller', '') + try: + # Set force_refresh_statuses=None to make sure the refresh only happens + # when the controller is INIT/UP (triggered in these statuses as the + # autostop is always set for the controller). This optimization avoids + # unnecessary costly refresh when the controller is already stopped. + # This optimization is based on the assumption that the user will not + # start the controller manually from the cloud console. 
+ controller_status, handle = refresh_cluster_status_handle( + cluster_name, force_refresh_statuses=None) + except exceptions.ClusterStatusFetchingError as e: + # We do not catch the exceptions related to the cluster owner identity + # mismatch, please refer to the comment in + # `backend_utils.check_cluster_available`. + logger.warning( + 'Failed to get the status of the controller. It is not ' + f'fatal, but {controller_name} commands/calls may hang or return ' + 'stale information, when the controller is not up.\n' + f' Details: {common_utils.format_exception(e, use_bracket=True)}') + record = global_user_state.get_cluster_from_name(cluster_name) + controller_status, handle = None, None + if record is not None: + controller_status, handle = record['status'], record['handle'] + + if controller_status is None: + sky_logging.print(non_existent_message) + elif controller_status != status_lib.ClusterStatus.UP: + msg = (f'{controller_name.capitalize()} controller {cluster_name} ' + f'is {controller_status.value}.') + if controller_status == status_lib.ClusterStatus.STOPPED: + msg += f'\n{stopped_message}' + if controller_status == status_lib.ClusterStatus.INIT: + msg += '\nPlease wait for the controller to be ready.' + sky_logging.print(msg) + handle = None + return controller_status, handle + + class CloudFilter(enum.Enum): # Filter for all types of clouds. ALL = 'all' @@ -2509,6 +2670,7 @@ class CloudFilter(enum.Enum): def get_clusters( + include_controller: bool, refresh: bool, cloud_filter: CloudFilter = CloudFilter.CLOUDS_AND_DOCKER, cluster_names: Optional[Union[str, List[str]]] = None, @@ -2521,6 +2683,8 @@ def get_clusters( of the clusters. Args: + include_controller: Whether to include controllers, e.g. spot controller + or sky serve controller. refresh: Whether to refresh the status of the clusters. (Refreshing will set the status to STOPPED if the cluster cannot be pinged.) 
cloud_filter: Sets which clouds to filer through from the global user @@ -2535,6 +2699,12 @@ def get_clusters( """ records = global_user_state.get_clusters() + if not include_controller: + records = [ + record for record in records + if Controllers.from_name(record['name']) is None + ] + yellow = colorama.Fore.YELLOW bright = colorama.Style.BRIGHT reset = colorama.Style.RESET_ALL @@ -2721,6 +2891,29 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: return resources_str +# TODO(tian): Refactor to controller_utils. Current blocker: circular import. +def check_cluster_name_not_controller( + cluster_name: Optional[str], + operation_str: Optional[str] = None) -> None: + """Errors out if the cluster name is a controller name. + + Raises: + sky.exceptions.NotSupportedError: if the cluster name is a controller + name, raise with an error message explaining 'operation_str' is not + allowed. + + Returns: + None, if the cluster name is not a controller name. + """ + controller = Controllers.from_name(cluster_name) + if controller is not None: + msg = controller.value.check_cluster_name_hint + if operation_str is not None: + msg += f' {operation_str} is not allowed.' 
+ with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError(msg) + + # Handle ctrl-c def interrupt_handler(signum, frame): del signum, frame diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index f9e78b1ed8e..422b9230630 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -52,7 +52,6 @@ from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils -from sky.utils import controller_utils from sky.utils import log_utils from sky.utils import resources_utils from sky.utils import rich_utils @@ -3370,8 +3369,8 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - controller = controller_utils.Controllers.check_cluster_name(name) - if controller == controller_utils.Controllers.SPOT_CONTROLLER: + controller = backend_utils.Controllers.from_name(name) + if controller == backend_utils.Controllers.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' @@ -3514,7 +3513,7 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - controller = controller_utils.Controllers.check_cluster_name(name) + controller = backend_utils.Controllers.from_name(name) if controller is not None or down: return stop_str = ('\nTo stop the cluster:' diff --git a/sky/backends/onprem_utils.py b/sky/backends/onprem_utils.py index a6c84c4df91..d69a2d8565d 100644 --- a/sky/backends/onprem_utils.py +++ b/sky/backends/onprem_utils.py @@ -105,7 +105,9 @@ def check_and_get_local_clusters(suppress_error: bool = False) -> List[str]: # Remove clusters that are in global user state but are not in # ~/.sky/local. 
records = backend_utils.get_clusters( - refresh=False, cloud_filter=backend_utils.CloudFilter.LOCAL) + include_controller=False, + refresh=False, + cloud_filter=backend_utils.CloudFilter.LOCAL) saved_clusters = [r['name'] for r in records] for cluster_name in saved_clusters: if cluster_name not in local_cluster_names: diff --git a/sky/cli.py b/sky/cli.py index b803fd65b0d..62a3afee399 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -60,12 +60,12 @@ from sky.benchmark import benchmark_state from sky.benchmark import benchmark_utils from sky.clouds import service_catalog +from sky.data import storage_utils from sky.skylet import constants from sky.skylet import job_lib from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils -from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import env_options from sky.utils import kubernetes_utils @@ -1398,7 +1398,7 @@ def launch( """ # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
env = _merge_env_vars(env_file, env) - controller_utils.check_cluster_name_not_reserved( + backend_utils.check_cluster_name_not_controller( cluster, operation_str='Launching tasks on it') if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1568,7 +1568,7 @@ def exec( raise ValueError('`ports` is not supported by `sky exec`.') env = _merge_env_vars(env_file, env) - controller_utils.check_cluster_name_not_reserved( + backend_utils.check_cluster_name_not_controller( cluster, operation_str='Executing task on it') handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: @@ -1907,22 +1907,21 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, click.echo(head_ip) return hints = [] - nonreserved_cluster_records = [] - reserved_clusters = [] + normal_clusters = [] + controllers = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = controller_utils.Controllers.check_cluster_name( - cluster_name) + controller = backend_utils.Controllers.from_name(cluster_name) if controller is not None: - reserved_clusters.append(cluster_record) + controllers.append(cluster_record) else: - nonreserved_cluster_records.append(cluster_record) + normal_clusters.append(cluster_record) local_clusters = onprem_utils.check_and_get_local_clusters( suppress_error=True) num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - nonreserved_cluster_records + reserved_clusters, all) + normal_clusters + controllers, all) status_utils.show_local_status_table(local_clusters) def _try_get_future_result(future) -> Tuple[bool, Any]: @@ -1968,7 +1967,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: 'shown)') job_info += '. ' hints.append( - controller_utils.Controllers.SPOT_CONTROLLER.value. + backend_utils.Controllers.SPOT_CONTROLLER.value. 
in_progress_hint.format(job_info=job_info)) if show_services: @@ -1988,7 +1987,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: num_services, msg = result click.echo(msg) if num_services is not None: - hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. + hints.append(backend_utils.Controllers.SKY_SERVE_CONTROLLER. value.in_progress_hint) if show_spot_jobs or show_services: @@ -2046,25 +2045,24 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin """ cluster_records = core.cost_report() - nonreserved_cluster_records = [] + normal_cluster_records = [] controllers = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = controller_utils.Controllers.check_cluster_name( - cluster_name) + controller = backend_utils.Controllers.from_name(cluster_name) if controller is not None: controller_name = controller.value.name - # to display most recent entry for each reserved cluster + # to display most recent entry for each controller cluster # TODO(sgurram): fix assumption of sorted order of clusters if controller_name not in controllers: controllers[controller_name] = cluster_record else: - nonreserved_cluster_records.append(cluster_record) + normal_cluster_records.append(cluster_record) total_cost = status_utils.get_total_cost_of_displayed_records( - nonreserved_cluster_records, all) + normal_cluster_records, all) - status_utils.show_cost_report_table(nonreserved_cluster_records, all) + status_utils.show_cost_report_table(normal_cluster_records, all) for controller_name, cluster_record in controllers.items(): status_utils.show_cost_report_table( [cluster_record], all, controller_name=controller_name.capitalize()) @@ -2321,7 +2319,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - controller = controller_utils.Controllers.check_cluster_name(cluster) + controller = 
backend_utils.Controllers.from_name(cluster) assert controller is not None, cluster click.echo(controller.value.decline_cancel_hint) sys.exit(1) @@ -2611,12 +2609,11 @@ def start( click.echo('Both --all and cluster(s) specified for sky start. ' 'Letting --all take effect.') - # Get all clusters that are not reserved names. + # Get all clusters that are not controllers. clusters = [ cluster['name'] for cluster in global_user_state.get_clusters() - if controller_utils.Controllers.check_cluster_name(cluster['name']) - is None + if backend_utils.Controllers.from_name(cluster['name']) is None ] if not clusters: @@ -2684,7 +2681,7 @@ def start( # Checks for controller clusters (spot controller / sky serve controller). controllers, normal_clusters = [], [] for name in to_start: - if controller_utils.Controllers.check_cluster_name(name) is not None: + if backend_utils.Controllers.from_name(name) is not None: controllers.append(name) else: normal_clusters.append(name) @@ -2806,8 +2803,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return - controller = controller_utils.Controllers.check_cluster_name( - controller_name) + controller = backend_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2859,8 +2855,7 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): click.echo('Sky serve controller has already been torn down.') return - controller = controller_utils.Controllers.check_cluster_name( - controller_name) + controller = backend_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2886,9 +2881,9 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): 
_CONTROLLER_TO_HINT_OR_RAISE = { - controller_utils.Controllers.SPOT_CONTROLLER: + backend_utils.Controllers.SPOT_CONTROLLER: (_hint_or_raise_for_down_spot_controller), - controller_utils.Controllers.SKY_SERVE_CONTROLLER: + backend_utils.Controllers.SKY_SERVE_CONTROLLER: (_hint_or_raise_for_down_sky_serve_controller), } @@ -2902,7 +2897,7 @@ def _down_or_stop_clusters( idle_minutes_to_autostop: Optional[int] = None) -> None: """Tears down or (auto-)stops a cluster (or all clusters). - Reserved clusters (spot controller and sky serve controller) can only be + Controllers (spot controller and sky serve controller) can only be terminated if the cluster name is explicitly and uniquely specified (not via glob) and purge is set to True. """ @@ -2934,14 +2929,14 @@ def _down_or_stop_clusters( operation = f'{verb} auto{option_str} on' if len(names) > 0: - reserved_clusters = [ + controllers = [ name for name in names - if controller_utils.Controllers.check_cluster_name(name) is not None + if backend_utils.Controllers.from_name(name) is not None ] - reserved_clusters_str = ', '.join(map(repr, reserved_clusters)) + controllers_str = ', '.join(map(repr, controllers)) names = [ name for name in _get_glob_clusters(names) - if controller_utils.Controllers.check_cluster_name(name) is None + if backend_utils.Controllers.from_name(name) is None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2954,32 +2949,32 @@ def _down_or_stop_clusters( f'Skipping local cluster {c}, as it does not support ' '`sky stop/autostop`.')) ] - # Make sure the reserved clusters are explicitly specified without other + # Make sure the controllers are explicitly specified without other # normal clusters. 
- if reserved_clusters: + if controllers: if len(names) != 0: names_str = ', '.join(map(repr, names)) raise click.UsageError( - f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} with other cluster(s) ' + f'{operation} controller(s) ' + f'{controllers_str} with other cluster(s) ' f'{names_str} is currently not supported.\n' - f'Please omit the reserved cluster(s) {reserved_clusters}.') - if len(reserved_clusters) > 1: + f'Please omit the controller(s) {controllers}.') + if len(controllers) > 1: raise click.UsageError( - f'{operation} multiple reserved clusters ' - f'{reserved_clusters_str} is currently not supported.\n' - f'Please specify only one reserved cluster.') - reserved_cluster = reserved_clusters[0] + f'{operation} multiple controllers ' + f'{controllers_str} is currently not supported.\n' + f'Please specify only one controller.') + controller_name = controllers[0] if not down: raise click.UsageError( - f'{operation} reserved cluster(s) ' - f'{reserved_clusters_str} is currently not supported.') + f'{operation} controller(s) ' + f'{controllers_str} is currently not supported.') else: - controller = controller_utils.Controllers.check_cluster_name( - reserved_cluster) + controller = backend_utils.Controllers.from_name( + controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] - hint_or_raise(reserved_cluster) + hint_or_raise(controller_name) confirm_str = 'delete' user_input = click.prompt( f'To proceed, please type {colorama.Style.BRIGHT}' @@ -2988,7 +2983,7 @@ def _down_or_stop_clusters( if user_input != confirm_str: raise click.Abort() no_confirm = True - names += reserved_clusters + names += controllers if apply_to_all: all_clusters = global_user_state.get_clusters() @@ -2996,14 +2991,12 @@ def _down_or_stop_clusters( click.echo( f'Both --all and cluster(s) specified for `sky {command}`. ' 'Letting --all take effect.') - # We should not remove reserved clusters when --all is specified. 
- # Otherwise, it would be very easy to accidentally delete a reserved - # cluster. + # We should not remove controllers when --all is specified. + # Otherwise, it would be very easy to accidentally delete a controller. names = [ record['name'] for record in all_clusters - if controller_utils.Controllers.check_cluster_name(record['name']) - is None + if backend_utils.Controllers.from_name(record['name']) is None ] clusters = [] @@ -3615,7 +3608,7 @@ def storage(): def storage_ls(all: bool): """List storage objects managed by SkyPilot.""" storages = sky.storage_ls() - storage_table = status_utils.format_storage_table(storages, show_all=all) + storage_table = storage_utils.format_storage_table(storages, show_all=all) click.echo(storage_table) @@ -4071,8 +4064,8 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): # Cancel managed spot jobs with IDs 1, 2, 3 $ sky spot cancel 1 2 3 """ - _, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + _, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None: # Hint messages already printed by the call above. @@ -4156,8 +4149,8 @@ def spot_dashboard(port: Optional[int]): hint = ( 'Dashboard is not available if spot controller is not up. Run a spot ' 'job first.') - _, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + _, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message=hint, non_existent_message=hint) if handle is None: @@ -4452,8 +4445,8 @@ def serve_down(service_names: List[str], all: bool, yes: bool): 'Can only specify one of SERVICE_NAMES or --all. 
' f'Provided {argument_str!r}.') - _, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + _, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have been terminated.') if handle is None: # Hint messages already printed by the call above. diff --git a/sky/core.py b/sky/core.py index 9cdd2305af8..a860661ca39 100644 --- a/sky/core.py +++ b/sky/core.py @@ -106,7 +106,8 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, cluster. If a cluster is found to be terminated or not found, it will be omitted from the returned list. """ - return backend_utils.get_clusters(refresh=refresh, + return backend_utils.get_clusters(include_controller=False, + refresh=refresh, cluster_names=cluster_names) @@ -183,8 +184,7 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if controller_utils.Controllers.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.from_name(cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' 'supported for SkyPilot controllers. Pass ' @@ -306,10 +306,9 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. 
""" - if controller_utils.Controllers.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.from_name(cluster_name) is not None: raise exceptions.NotSupportedError( - f'Stopping sky reserved cluster {cluster_name!r} ' + f'Stopping SkyPilot controller {cluster_name!r} ' f'is not supported.') handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: @@ -430,10 +429,9 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if controller_utils.Controllers.check_cluster_name( - cluster_name) is not None: + if backend_utils.Controllers.from_name(cluster_name) is not None: raise exceptions.NotSupportedError( - f'{operation} sky reserved cluster {cluster_name!r} ' + f'{operation} SkyPilot controller {cluster_name!r} ' f'is not supported.') handle = backend_utils.check_cluster_available( cluster_name, @@ -560,13 +558,13 @@ def cancel( ValueError: if arguments are invalid, or the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the specified cluster is a - reserved cluster that does not support this operation. + controller that does not support this operation. sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user is not the same as the user who created the cluster. sky.exceptions.CloudUserIdentityError: if we fail to get the current user identity. 
""" - controller_utils.check_cluster_name_not_reserved( + backend_utils.check_cluster_name_not_controller( cluster_name, operation_str='Cancelling jobs') if all and job_ids: @@ -800,8 +798,8 @@ def spot_queue(refresh: bool, stop_msg = '' if not refresh: stop_msg = 'To view the latest job table: sky spot queue --refresh' - controller_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + controller_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message=stop_msg) if (refresh and controller_status in [ @@ -873,12 +871,12 @@ def spot_cancel(name: Optional[str] = None, RuntimeError: failed to cancel the job. """ job_ids = [] if job_ids is None else job_ids - cluster_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + cluster_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None or handle.head_ip is None: # The error message is already printed in - # controller_utils.is_controller_up + # backend_utils.is_controller_up # TODO(zhwu): Move the error message into the exception. with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError(message='', @@ -933,8 +931,8 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int], sky.exceptions.ClusterNotUpError: the spot controller is not up. 
""" # TODO(zhwu): Automatically restart the spot controller - controller_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SPOT_CONTROLLER, + controller_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SPOT_CONTROLLER, stopped_message=('Please restart the spot controller with ' f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')) if handle is None or handle.head_ip is None: @@ -1069,8 +1067,8 @@ def serve_status( # TODO(tian): This is so slow... It will take ~10s to refresh the status # of controller. Can we optimize this? - controller_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: @@ -1086,7 +1084,7 @@ def serve_status( backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - code = serve.ServeCodeGen.get_serve_status(service_names) + code = serve.ServeCodeGen.get_service_status(service_names) returncode, serve_status_payload, stderr = backend.run_on_head( handle, code, @@ -1126,12 +1124,12 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, service_names = [] if isinstance(service_names, str): service_names = [service_names] - cluster_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + cluster_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have terminated.') if handle is None or handle.head_ip is None: # The error message is already printed in - # controller_utils.is_controller_up + # backend_utils.is_controller_up # TODO(zhwu): Move the error 
message into the exception. with ux_utils.print_exception_no_traceback(): raise exceptions.ClusterNotUpError(message='', @@ -1230,8 +1228,8 @@ def serve_tail_logs( with ux_utils.print_exception_no_traceback(): raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') - controller_status, handle = controller_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_status, handle = backend_utils.is_controller_up( + controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: msg = 'No service is found.' diff --git a/sky/data/storage_utils.py b/sky/data/storage_utils.py index 916af13a77b..044e00f5aeb 100644 --- a/sky/data/storage_utils.py +++ b/sky/data/storage_utils.py @@ -1,12 +1,14 @@ """Utility functions for the storage module.""" import os import subprocess -from typing import List +from typing import Any, Dict, List import colorama from sky import exceptions from sky import sky_logging +from sky.utils import log_utils +from sky.utils.cli_utils import status_utils logger = sky_logging.init_logger(__name__) @@ -17,6 +19,49 @@ 'due to the following error: {error_msg!r}') +def format_storage_table(storages: List[Dict[str, Any]], + show_all: bool = False) -> str: + """Format the storage table for display. + + Args: + storage_table (dict): The storage table. + + Returns: + str: The formatted storage table. 
+ """ + storage_table = log_utils.create_table([ + 'NAME', + 'UPDATED', + 'STORE', + 'COMMAND', + 'STATUS', + ]) + + for row in storages: + launched_at = row['launched_at'] + if show_all: + command = row['last_use'] + else: + command = status_utils.truncate_long_string( + row['last_use'], status_utils.COMMAND_TRUNC_LENGTH) + storage_table.add_row([ + # NAME + row['name'], + # LAUNCHED + log_utils.readable_time_duration(launched_at), + # CLOUDS + ', '.join([s.value for s in row['store']]), + # COMMAND, + command, + # STATUS + row['status'].value, + ]) + if storages: + return str(storage_table) + else: + return 'No existing storage.' + + def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: """ Lists files and patterns ignored by git in the source directory diff --git a/sky/execution.py b/sky/execution.py index 0e58b72ee94..675a2902884 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -381,8 +381,7 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - controller = controller_utils.Controllers.check_cluster_name( - cluster_name) + controller = backend_utils.Controllers.from_name(cluster_name) if controller is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # @@ -512,8 +511,8 @@ def launch( if dryrun. """ entrypoint = task - controller_utils.check_cluster_name_not_reserved(cluster_name, - operation_str='sky.launch') + backend_utils.check_cluster_name_not_controller(cluster_name, + operation_str='sky.launch') return _execute( entrypoint=entrypoint, @@ -589,7 +588,7 @@ def exec( # pylint: disable=redefined-builtin ValueError: if the specified cluster does not exist or is not in UP status. sky.exceptions.NotSupportedError: if the specified cluster is a - reserved cluster that does not support this operation. + controller that does not support this operation. Returns: job_id: Optional[int]; the job ID of the submitted job. 
None if the @@ -604,8 +603,8 @@ def exec( # pylint: disable=redefined-builtin f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is ' 'deprecated. Pass sky.Task instead.' f'{colorama.Style.RESET_ALL}') - controller_utils.check_cluster_name_not_reserved(cluster_name, - operation_str='sky.exec') + backend_utils.check_cluster_name_not_controller(cluster_name, + operation_str='sky.exec') handle = backend_utils.check_cluster_available( cluster_name, diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 532db48e0ba..b9a91447ebc 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -259,7 +259,7 @@ def update_service_status() -> None: record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED) -def _get_serve_status( +def _get_service_status( service_name: str, with_replica_info: bool = True) -> Optional[Dict[str, Any]]: """Get the status dict of the service. @@ -283,13 +283,13 @@ def _get_serve_status( return record -def get_serve_status_encoded(service_names: Optional[List[str]]) -> str: +def get_service_status_encoded(service_names: Optional[List[str]]) -> str: serve_statuses = [] if service_names is None: # Get all service names service_names = serve_state.get_glob_service_names(None) for service_name in service_names: - serve_status = _get_serve_status(service_name) + serve_status = _get_service_status(service_name) if serve_status is None: continue serve_statuses.append({ @@ -314,7 +314,8 @@ def terminate_services(service_names: Optional[List[str]]) -> str: service_names = serve_state.get_glob_service_names(service_names) terminated_service_names = [] for service_name in service_names: - serve_status = _get_serve_status(service_name, with_replica_info=False) + serve_status = _get_service_status(service_name, + with_replica_info=False) assert serve_status is not None if (serve_status['status'] in serve_state.ServiceStatus.refuse_to_terminate_statuses()): @@ -561,7 +562,7 @@ class ServeCodeGen: """Code generator for SkyServe. 
Usage: - >> code = ServeCodeGen.get_serve_status(service_name) + >> code = ServeCodeGen.get_service_status(service_name) """ _PREFIX = [ 'from sky.serve import serve_state', @@ -569,9 +570,9 @@ class ServeCodeGen: ] @classmethod - def get_serve_status(cls, service_names: Optional[List[str]]) -> str: + def get_service_status(cls, service_names: Optional[List[str]]) -> str: code = [ - f'msg = serve_utils.get_serve_status_encoded({service_names!r})', + f'msg = serve_utils.get_service_status_encoded({service_names!r})', 'print(msg, end="", flush=True)' ] return cls._build(code) diff --git a/sky/serve/service.py b/sky/serve/service.py index b67493b16b9..1cd2bc18b22 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -108,7 +108,8 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: info) failed = True logger.error(f'Replica {info.replica_id} failed to terminate.') - failed = failed or _cleanup_storage(task_yaml) + if _cleanup_storage(task_yaml): + failed = True return failed diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index cbf7d960f53..dc8bec5c237 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -194,49 +194,6 @@ def format_replica_table(replica_records: List[_ReplicaRecord], return f'{replica_table}{truncate_hint}' -def format_storage_table(storages: List[Dict[str, Any]], - show_all: bool = False) -> str: - """Format the storage table for display. - - Args: - storage_table (dict): The storage table. - - Returns: - str: The formatted storage table. 
- """ - storage_table = log_utils.create_table([ - 'NAME', - 'UPDATED', - 'STORE', - 'COMMAND', - 'STATUS', - ]) - - for row in storages: - launched_at = row['launched_at'] - if show_all: - command = row['last_use'] - else: - command = truncate_long_string(row['last_use'], - COMMAND_TRUNC_LENGTH) - storage_table.add_row([ - # NAME - row['name'], - # LAUNCHED - log_utils.readable_time_duration(launched_at), - # CLOUDS - ', '.join([s.value for s in row['store']]), - # COMMAND, - command, - # STATUS - row['status'].value, - ]) - if storages: - return str(storage_table) - else: - return 'No existing storage.' - - def get_total_cost_of_displayed_records( cluster_records: List[_ClusterCostReportRecord], display_all: bool): """Compute total cost of records to be displayed in cost report.""" @@ -341,8 +298,10 @@ def show_local_status_table(local_clusters: List[str]): `sky launch`. Sky understands what types of resources are on the nodes and has ran at least one job on the cluster. """ - clusters_status = controller_utils.get_non_reserved_clusters( - refresh=False, cloud_filter=backend_utils.CloudFilter.LOCAL) + clusters_status = backend_utils.get_clusters( + include_controller=False, + refresh=False, + cloud_filter=backend_utils.CloudFilter.LOCAL) columns = [ 'NAME', 'USER', @@ -463,7 +422,7 @@ def _get_replicas(service_record: _ServiceRecord) -> str: def get_endpoint(service_record: _ServiceRecord) -> str: - # Don't use controller_utils.is_controller_up since it is too slow. + # Don't use backend_utils.is_controller_up since it is too slow. 
handle = global_user_state.get_handle_from_cluster_name( serve.SKY_SERVE_CONTROLLER_NAME) assert isinstance(handle, backends.CloudVmRayResourceHandle) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 943bce55630..48cbbb1c6a5 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -1,24 +1,13 @@ """Util constants/functions for SkyPilot Controllers.""" -import dataclasses -import enum import os import typing -from typing import Any, Dict, List, Optional, Tuple, Union - -import colorama +from typing import Optional from sky import exceptions -from sky import global_user_state -from sky import serve from sky import sky_logging -from sky import spot -from sky import status_lib -from sky.backends import backend_utils from sky.utils import common_utils -from sky.utils import ux_utils if typing.TYPE_CHECKING: - from sky import backends from sky.backends import cloud_vm_ray_backend logger = sky_logging.init_logger(__name__) @@ -28,163 +17,6 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 -@dataclasses.dataclass -class _ControllerSpec: - """Spec for skypilot controllers.""" - name: str - cluster_name: str - in_progress_hint: str - decline_cancel_hint: str - decline_down_in_init_status_hint: str - decline_down_for_dirty_controller_hint: str - check_cluster_name_hint: str - default_hint_if_non_existent: str - - -class Controllers(enum.Enum): - """Skypilot controllers.""" - # NOTE(dev): Keep this align with - # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE - SPOT_CONTROLLER = _ControllerSpec( - name='managed spot controller', - cluster_name=spot.SPOT_CONTROLLER_NAME, - in_progress_hint=( - '* {job_info}To see all spot jobs: ' - f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' - f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), - decline_down_in_init_status_hint=( - 
f'{colorama.Fore.RED}Tearing down the spot controller while ' - 'it is in INIT state is not supported (this means a spot launch ' - 'is in progress or the previous launch failed), as we cannot ' - 'guarantee that all the spot jobs are finished. Please wait ' - 'until the spot controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{spot.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' - f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' - f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), - check_cluster_name_hint=( - f'Cluster {spot.SPOT_CONTROLLER_NAME} is reserved for ' - 'managed spot controller. '), - default_hint_if_non_existent='No managed spot jobs are found.') - SKY_SERVE_CONTROLLER = _ControllerSpec( - name='sky serve controller', - cluster_name=serve.SKY_SERVE_CONTROLLER_NAME, - in_progress_hint=( - f'* To see detailed service status: {colorama.Style.BRIGHT}' - f'sky serve status -a{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the sky serve controller\'s jobs is not allowed.'), - decline_down_in_init_status_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller ' - 'while it is in INIT state is not supported (this means a sky ' - 'serve up is in progress or the previous launch failed), as we ' - 'cannot guarantee that all the services are terminated. Please ' - 'wait until the sky serve controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{serve.SKY_SERVE_CONTROLLER_NAME}' - f'{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' - 'supported, as it is currently serving the following services: ' - '{service_names}. 
Please terminate the services first with ' - f'{colorama.Style.BRIGHT}sky serve down -a' - f'{colorama.Style.RESET_ALL}.'), - check_cluster_name_hint=( - f'Cluster {serve.SKY_SERVE_CONTROLLER_NAME} is reserved for ' - 'sky serve controller. '), - default_hint_if_non_existent='No service is found.') - - @classmethod - def check_cluster_name(cls, name: Optional[str]) -> Optional['Controllers']: - """Check if the cluster name is a controller name. - - Returns: - The controller if the cluster name is a controller name. - Otherwise, returns None. - """ - for controller in cls: - if controller.value.cluster_name == name: - return controller - return None - - -def is_controller_up( - controller_type: Controllers, - stopped_message: str, - non_existent_message: Optional[str] = None, -) -> Tuple[Optional[status_lib.ClusterStatus], - Optional['backends.CloudVmRayResourceHandle']]: - """Check if the spot/serve controller is up. - - It can be used to check the actual controller status (since the autostop is - set for the controller) before the spot/serve commands interact with the - controller. - - Args: - type: Type of the controller. - stopped_message: Message to print if the controller is STOPPED. - non_existent_message: Message to show if the controller does not exist. - - Returns: - controller_status: The status of the controller. If it fails during - refreshing the status, it will be the cached status. None if the - controller does not exist. - handle: The ResourceHandle of the controller. None if the - controller is not UP or does not exist. - - Raises: - exceptions.ClusterOwnerIdentityMismatchError: if the current user is not - the same as the user who created the cluster. - exceptions.CloudUserIdentityError: if we fail to get the current user - identity. 
- """ - if non_existent_message is None: - non_existent_message = ( - controller_type.value.default_hint_if_non_existent) - cluster_name = controller_type.value.cluster_name - controller_name = controller_type.value.name.replace(' controller', '') - try: - # Set force_refresh_statuses=None to make sure the refresh only happens - # when the controller is INIT/UP (triggered in these statuses as the - # autostop is always set for the controller). This optimization avoids - # unnecessary costly refresh when the controller is already stopped. - # This optimization is based on the assumption that the user will not - # start the controller manually from the cloud console. - controller_status, handle = backend_utils.refresh_cluster_status_handle( - cluster_name, force_refresh_statuses=None) - except exceptions.ClusterStatusFetchingError as e: - # We do not catch the exceptions related to the cluster owner identity - # mismatch, please refer to the comment in - # `backend_utils.check_cluster_available`. - logger.warning( - 'Failed to get the status of the controller. It is not ' - f'fatal, but {controller_name} commands/calls may hang or return ' - 'stale information, when the controller is not up.\n' - f' Details: {common_utils.format_exception(e, use_bracket=True)}') - record = global_user_state.get_cluster_from_name(cluster_name) - controller_status, handle = None, None - if record is not None: - controller_status, handle = record['status'], record['handle'] - - if controller_status is None: - sky_logging.print(non_existent_message) - elif controller_status != status_lib.ClusterStatus.UP: - msg = (f'{controller_name.capitalize()} controller {cluster_name} ' - f'is {controller_status.value}.') - if controller_status == status_lib.ClusterStatus.STOPPED: - msg += f'\n{stopped_message}' - if controller_status == status_lib.ClusterStatus.INIT: - msg += '\nPlease wait for the controller to be ready.' 
- sky_logging.print(msg) - handle = None - return controller_status, handle - - # Internal only: def download_and_stream_latest_job_log( backend: 'cloud_vm_ray_backend.CloudVmRayBackend', @@ -225,44 +57,3 @@ def download_and_stream_latest_job_log( logger.error('Failed to find the logs for the user ' f'program at {log_file}.') return log_file - - -def get_non_reserved_clusters( - refresh: bool, - cloud_filter: backend_utils.CloudFilter = backend_utils.CloudFilter. - CLOUDS_AND_DOCKER, - cluster_names: Optional[Union[str, List[str]]] = None, -) -> List[Dict[str, Any]]: - """Wrapper for the backend_utils.get_clusters without reserved clusters.""" - records = backend_utils.get_clusters(refresh=refresh, - cloud_filter=cloud_filter, - cluster_names=cluster_names) - records = [ - record for record in records - if Controllers.check_cluster_name(record['name']) is None - ] - return records - - -def check_cluster_name_not_reserved( - cluster_name: Optional[str], - operation_str: Optional[str] = None) -> None: - """Errors out if the cluster name is reserved. - - Currently, all reserved cluster names are skypilot controller, i.e. - spot controller/sky serve controller. - - Raises: - sky.exceptions.NotSupportedError: if the cluster name is reserved, raise - with an error message explaining 'operation_str' is not allowed. - - Returns: - None, if the cluster name is not reserved. - """ - controller = Controllers.check_cluster_name(cluster_name) - if controller is not None: - msg = controller.value.check_cluster_name_hint - if operation_str is not None: - msg += f' {operation_str} is not allowed.' 
- with ux_utils.print_exception_no_traceback(): - raise exceptions.NotSupportedError(msg) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 2419a7cfa7a..135e5af3932 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -332,30 +332,28 @@ def get_config_schema(): # pylint: disable=import-outside-toplevel from sky.utils import kubernetes_enums - def _get_controller_resources_schema(is_serve: bool = False): - resources_schema = { - k: v - for k, v in get_resources_schema().items() - # Validation may fail if $schema is included. - if k != '$schema' - } - if is_serve: - resources_schema['properties'].pop('ports') - return { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'controller': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'resources': resources_schema, - } - }, - } + resources_schema = { + k: v + for k, v in get_resources_schema().items() + # Validation may fail if $schema is included. 
+ if k != '$schema' + } + resources_schema['properties'].pop('ports') + controller_resources_schema = { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'controller': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'resources': resources_schema, + } + }, } + } return { '$schema': 'https://json-schema.org/draft/2020-12/schema', @@ -363,8 +361,8 @@ def _get_controller_resources_schema(is_serve: bool = False): 'required': [], 'additionalProperties': False, 'properties': { - 'spot': _get_controller_resources_schema(), - 'serve': _get_controller_resources_schema(is_serve=True), + 'spot': controller_resources_schema, + 'serve': controller_resources_schema, 'aws': { 'type': 'object', 'required': [], diff --git a/tests/test_jobs.py b/tests/test_jobs.py index a1be762506d..47416af5d91 100644 --- a/tests/test_jobs.py +++ b/tests/test_jobs.py @@ -8,7 +8,7 @@ class TestExecutionOnExistingClusters: - """Test operations on reserved clusters.""" + """Test operations on existing clusters.""" @pytest.fixture def _mock_db_conn(self, monkeypatch, tmp_path): diff --git a/tests/test_spot.py b/tests/test_spot.py index e5bb5317012..9c7b0e3df81 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -29,8 +29,8 @@ def test_spot_nonexist_strategy(): sky.Task.from_yaml(f.name) -class TestReservedClustersOperations: - """Test operations on reserved clusters.""" +class TestControllerOperations: + """Test operations on controllers.""" @pytest.fixture def _mock_db_conn(self, monkeypatch, tmp_path): @@ -147,9 +147,8 @@ def test_stop_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() result = cli_runner.invoke(cli.stop, [spot.SPOT_CONTROLLER_NAME]) assert result.exit_code == click.UsageError.exit_code - assert ( - f'Stopping reserved cluster(s) \'{spot.SPOT_CONTROLLER_NAME}\' is ' - 'currently not supported' in result.output) + assert (f'Stopping controller(s) 
\'{spot.SPOT_CONTROLLER_NAME}\' is ' + 'currently not supported' in result.output) result = cli_runner.invoke(cli.stop, ['sky-spot-con*']) assert not result.exception @@ -164,7 +163,7 @@ def test_autostop_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() result = cli_runner.invoke(cli.autostop, [spot.SPOT_CONTROLLER_NAME]) assert result.exit_code == click.UsageError.exit_code - assert ('Scheduling autostop on reserved cluster(s) ' + assert ('Scheduling autostop on controller(s) ' f'\'{spot.SPOT_CONTROLLER_NAME}\' is currently not supported' in result.output) From 869eb75c62275bbe52b97b179c1f19c89730d9d5 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 01:44:21 -0800 Subject: [PATCH 202/223] stream logs --- sky/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index a860661ca39..e5400fab5c1 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1097,7 +1097,7 @@ def serve_status( code, 'Failed to fetch services', stderr, - stream_logs=False) + stream_logs=True) except exceptions.CommandError as e: raise RuntimeError(e.error_msg) from e From 9e5c69e39aca526a7645300351bdc95c02d5b51e Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 01:55:40 -0800 Subject: [PATCH 203/223] fix not showing controller --- sky/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index e5400fab5c1..36dd0b5810c 100644 --- a/sky/core.py +++ b/sky/core.py @@ -106,7 +106,7 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None, cluster. If a cluster is found to be terminated or not found, it will be omitted from the returned list. 
""" - return backend_utils.get_clusters(include_controller=False, + return backend_utils.get_clusters(include_controller=True, refresh=refresh, cluster_names=cluster_names) From d14b5cf2c274a0e13d0de9d475068fdebc45be38 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 21:19:50 -0800 Subject: [PATCH 204/223] move controllers back to controller_utils --- sky/backends/backend_utils.py | 116 +-------------------------- sky/backends/cloud_vm_ray_backend.py | 7 +- sky/cli.py | 41 +++++----- sky/core.py | 20 ++--- sky/execution.py | 10 +-- sky/utils/controller_utils.py | 114 ++++++++++++++++++++++++++ 6 files changed, 157 insertions(+), 151 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 03b17590b15..886dc356ba1 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1,5 +1,4 @@ """Util constants/functions for the backends.""" -import dataclasses from datetime import datetime import enum import getpass @@ -39,7 +38,6 @@ from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config -from sky import spot as spot_lib from sky import status_lib from sky.backends import onprem_utils from sky.provision import instance_setup @@ -48,6 +46,7 @@ from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import env_options from sky.utils import rich_utils from sky.utils import subprocess_utils @@ -153,92 +152,6 @@ ] -# TODO(tian): Refactor to controller_utils. Current blocker: circular import. 
-@dataclasses.dataclass -class _ControllerSpec: - """Spec for skypilot controllers.""" - name: str - cluster_name: str - in_progress_hint: str - decline_cancel_hint: str - decline_down_in_init_status_hint: str - decline_down_for_dirty_controller_hint: str - check_cluster_name_hint: str - default_hint_if_non_existent: str - - -class Controllers(enum.Enum): - """Skypilot controllers.""" - # NOTE(dev): Keep this align with - # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE - SPOT_CONTROLLER = _ControllerSpec( - name='managed spot controller', - cluster_name=spot_lib.SPOT_CONTROLLER_NAME, - in_progress_hint=( - '* {job_info}To see all spot jobs: ' - f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' - f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), - decline_down_in_init_status_hint=( - f'{colorama.Fore.RED}Tearing down the spot controller while ' - 'it is in INIT state is not supported (this means a spot launch ' - 'is in progress or the previous launch failed), as we cannot ' - 'guarantee that all the spot jobs are finished. Please wait ' - 'until the spot controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' - f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' - f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), - check_cluster_name_hint=( - f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' - 'managed spot controller. 
'), - default_hint_if_non_existent='No managed spot jobs are found.') - SKY_SERVE_CONTROLLER = _ControllerSpec( - name='sky serve controller', - cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, - in_progress_hint=( - f'* To see detailed service status: {colorama.Style.BRIGHT}' - f'sky serve status -a{colorama.Style.RESET_ALL}'), - decline_cancel_hint=( - 'Cancelling the sky serve controller\'s jobs is not allowed.'), - decline_down_in_init_status_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller ' - 'while it is in INIT state is not supported (this means a sky ' - 'serve up is in progress or the previous launch failed), as we ' - 'cannot guarantee that all the services are terminated. Please ' - 'wait until the sky serve controller is UP or fix it with ' - f'{colorama.Style.BRIGHT}sky start ' - f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' - f'{colorama.Style.RESET_ALL}.'), - decline_down_for_dirty_controller_hint=( - f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' - 'supported, as it is currently serving the following services: ' - '{service_names}. Please terminate the services first with ' - f'{colorama.Style.BRIGHT}sky serve down -a' - f'{colorama.Style.RESET_ALL}.'), - check_cluster_name_hint=( - f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' - 'sky serve controller. '), - default_hint_if_non_existent='No service is found.') - - @classmethod - def from_name(cls, name: Optional[str]) -> Optional['Controllers']: - """Check if the cluster name is a controller name. - - Returns: - The controller if the cluster name is a controller name. - Otherwise, returns None. - """ - for controller in cls: - if controller.value.cluster_name == name: - return controller - return None - - def is_ip(s: str) -> bool: """Returns whether this string matches IP_ADDR_REGEX.""" return len(re.findall(IP_ADDR_REGEX, s)) == 1 @@ -2589,7 +2502,7 @@ def check_cluster_available( # TODO(tian): Refactor to controller_utils. 
Current blocker: circular import. def is_controller_up( - controller_type: Controllers, + controller_type: controller_utils.Controllers, stopped_message: str, non_existent_message: Optional[str] = None, ) -> Tuple[Optional[status_lib.ClusterStatus], @@ -2702,7 +2615,7 @@ def get_clusters( if not include_controller: records = [ record for record in records - if Controllers.from_name(record['name']) is None + if controller_utils.Controllers.from_name(record['name']) is None ] yellow = colorama.Fore.YELLOW @@ -2891,29 +2804,6 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: return resources_str -# TODO(tian): Refactor to controller_utils. Current blocker: circular import. -def check_cluster_name_not_controller( - cluster_name: Optional[str], - operation_str: Optional[str] = None) -> None: - """Errors out if the cluster name is a controller name. - - Raises: - sky.exceptions.NotSupportedError: if the cluster name is a controller - name, raise with an error message explaining 'operation_str' is not - allowed. - - Returns: - None, if the cluster name is not a controller name. - """ - controller = Controllers.from_name(cluster_name) - if controller is not None: - msg = controller.value.check_cluster_name_hint - if operation_str is not None: - msg += f' {operation_str} is not allowed.' 
- with ux_utils.print_exception_no_traceback(): - raise exceptions.NotSupportedError(msg) - - # Handle ctrl-c def interrupt_handler(signum, frame): del signum, frame diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 422b9230630..ad64666dc94 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -52,6 +52,7 @@ from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import log_utils from sky.utils import resources_utils from sky.utils import rich_utils @@ -3369,8 +3370,8 @@ def _exec_code_on_head( self.tail_logs(handle, job_id) finally: name = handle.cluster_name - controller = backend_utils.Controllers.from_name(name) - if controller == backend_utils.Controllers.SPOT_CONTROLLER: + controller = controller_utils.Controllers.from_name(name) + if controller == controller_utils.Controllers.SPOT_CONTROLLER: logger.info( f'{fore.CYAN}Spot Job ID: ' f'{style.BRIGHT}{job_id}{style.RESET_ALL}' @@ -3513,7 +3514,7 @@ def _post_execute(self, handle: CloudVmRayResourceHandle, fore = colorama.Fore style = colorama.Style name = handle.cluster_name - controller = backend_utils.Controllers.from_name(name) + controller = controller_utils.Controllers.from_name(name) if controller is not None or down: return stop_str = ('\nTo stop the cluster:' diff --git a/sky/cli.py b/sky/cli.py index 62a3afee399..491363303f1 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -66,6 +66,7 @@ from sky.usage import usage_lib from sky.utils import command_runner from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import env_options from sky.utils import kubernetes_utils @@ -1398,7 +1399,7 @@ def launch( """ # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
env = _merge_env_vars(env_file, env) - backend_utils.check_cluster_name_not_controller( + controller_utils.check_cluster_name_not_controller( cluster, operation_str='Launching tasks on it') if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1568,7 +1569,7 @@ def exec( raise ValueError('`ports` is not supported by `sky exec`.') env = _merge_env_vars(env_file, env) - backend_utils.check_cluster_name_not_controller( + controller_utils.check_cluster_name_not_controller( cluster, operation_str='Executing task on it') handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: @@ -1911,7 +1912,7 @@ def status(all: bool, refresh: bool, ip: bool, show_spot_jobs: bool, controllers = [] for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = backend_utils.Controllers.from_name(cluster_name) + controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controllers.append(cluster_record) else: @@ -1967,7 +1968,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: 'shown)') job_info += '. ' hints.append( - backend_utils.Controllers.SPOT_CONTROLLER.value. + controller_utils.Controllers.SPOT_CONTROLLER.value. in_progress_hint.format(job_info=job_info)) if show_services: @@ -1987,7 +1988,7 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: num_services, msg = result click.echo(msg) if num_services is not None: - hints.append(backend_utils.Controllers.SKY_SERVE_CONTROLLER. + hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. 
value.in_progress_hint) if show_spot_jobs or show_services: @@ -2049,7 +2050,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin controllers = dict() for cluster_record in cluster_records: cluster_name = cluster_record['name'] - controller = backend_utils.Controllers.from_name(cluster_name) + controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controller_name = controller.value.name # to display most recent entry for each controller cluster @@ -2319,7 +2320,7 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) except exceptions.NotSupportedError: - controller = backend_utils.Controllers.from_name(cluster) + controller = controller_utils.Controllers.from_name(cluster) assert controller is not None, cluster click.echo(controller.value.decline_cancel_hint) sys.exit(1) @@ -2613,7 +2614,7 @@ def start( clusters = [ cluster['name'] for cluster in global_user_state.get_clusters() - if backend_utils.Controllers.from_name(cluster['name']) is None + if controller_utils.Controllers.from_name(cluster['name']) is None ] if not clusters: @@ -2681,7 +2682,7 @@ def start( # Checks for controller clusters (spot controller / sky serve controller). 
controllers, normal_clusters = [], [] for name in to_start: - if backend_utils.Controllers.from_name(name) is not None: + if controller_utils.Controllers.from_name(name) is not None: controllers.append(name) else: normal_clusters.append(name) @@ -2803,7 +2804,7 @@ def _hint_or_raise_for_down_spot_controller(controller_name: str): click.echo('Managed spot controller has already been torn down.') return - controller = backend_utils.Controllers.from_name(controller_name) + controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2855,7 +2856,7 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): click.echo('Sky serve controller has already been torn down.') return - controller = backend_utils.Controllers.from_name(controller_name) + controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name if cluster_status == status_lib.ClusterStatus.INIT: with ux_utils.print_exception_no_traceback(): @@ -2881,9 +2882,9 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): _CONTROLLER_TO_HINT_OR_RAISE = { - backend_utils.Controllers.SPOT_CONTROLLER: + controller_utils.Controllers.SPOT_CONTROLLER: (_hint_or_raise_for_down_spot_controller), - backend_utils.Controllers.SKY_SERVE_CONTROLLER: + controller_utils.Controllers.SKY_SERVE_CONTROLLER: (_hint_or_raise_for_down_sky_serve_controller), } @@ -2931,12 +2932,12 @@ def _down_or_stop_clusters( if len(names) > 0: controllers = [ name for name in names - if backend_utils.Controllers.from_name(name) is not None + if controller_utils.Controllers.from_name(name) is not None ] controllers_str = ', '.join(map(repr, controllers)) names = [ name for name in _get_glob_clusters(names) - if backend_utils.Controllers.from_name(name) is None + if controller_utils.Controllers.from_name(name) is 
None ] if not down: local_clusters = onprem_utils.check_and_get_local_clusters() @@ -2970,7 +2971,7 @@ def _down_or_stop_clusters( f'{operation} controller(s) ' f'{controllers_str} is currently not supported.') else: - controller = backend_utils.Controllers.from_name( + controller = controller_utils.Controllers.from_name( controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] @@ -2996,7 +2997,7 @@ def _down_or_stop_clusters( names = [ record['name'] for record in all_clusters - if backend_utils.Controllers.from_name(record['name']) is None + if controller_utils.Controllers.from_name(record['name']) is None ] clusters = [] @@ -4065,7 +4066,7 @@ def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): $ sky spot cancel 1 2 3 """ _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None: # Hint messages already printed by the call above. @@ -4150,7 +4151,7 @@ def spot_dashboard(port: Optional[int]): 'Dashboard is not available if spot controller is not up. Run a spot ' 'job first.') _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=hint, non_existent_message=hint) if handle is None: @@ -4446,7 +4447,7 @@ def serve_down(service_names: List[str], all: bool, yes: bool): f'Provided {argument_str!r}.') _, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have been terminated.') if handle is None: # Hint messages already printed by the call above. 
diff --git a/sky/core.py b/sky/core.py index 36dd0b5810c..f9e38338604 100644 --- a/sky/core.py +++ b/sky/core.py @@ -184,7 +184,7 @@ def _start( f'Starting cluster {cluster_name!r} with backend {backend.NAME} ' 'is not supported.') - if backend_utils.Controllers.from_name(cluster_name) is not None: + if controller_utils.Controllers.from_name(cluster_name) is not None: if down: raise ValueError('Using autodown (rather than autostop) is not ' 'supported for SkyPilot controllers. Pass ' @@ -306,7 +306,7 @@ def stop(cluster_name: str, purge: bool = False) -> None: sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed spot controller. """ - if backend_utils.Controllers.from_name(cluster_name) is not None: + if controller_utils.Controllers.from_name(cluster_name) is not None: raise exceptions.NotSupportedError( f'Stopping SkyPilot controller {cluster_name!r} ' f'is not supported.') @@ -429,7 +429,7 @@ def autostop( if is_cancel: option_str = '{stop,down}' operation = f'{verb} auto{option_str}' - if backend_utils.Controllers.from_name(cluster_name) is not None: + if controller_utils.Controllers.from_name(cluster_name) is not None: raise exceptions.NotSupportedError( f'{operation} SkyPilot controller {cluster_name!r} ' f'is not supported.') @@ -564,7 +564,7 @@ def cancel( sky.exceptions.CloudUserIdentityError: if we fail to get the current user identity. 
""" - backend_utils.check_cluster_name_not_controller( + controller_utils.check_cluster_name_not_controller( cluster_name, operation_str='Cancelling jobs') if all and job_ids: @@ -799,7 +799,7 @@ def spot_queue(refresh: bool, if not refresh: stop_msg = 'To view the latest job table: sky spot queue --refresh' controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=stop_msg) if (refresh and controller_status in [ @@ -872,7 +872,7 @@ def spot_cancel(name: Optional[str] = None, """ job_ids = [] if job_ids is None else job_ids cluster_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message='All managed spot jobs should have finished.') if handle is None or handle.head_ip is None: # The error message is already printed in @@ -932,7 +932,7 @@ def spot_tail_logs(name: Optional[str], job_id: Optional[int], """ # TODO(zhwu): Automatically restart the spot controller controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SPOT_CONTROLLER, + controller_type=controller_utils.Controllers.SPOT_CONTROLLER, stopped_message=('Please restart the spot controller with ' f'`sky start {spot.SPOT_CONTROLLER_NAME}`.')) if handle is None or handle.head_ip is None: @@ -1068,7 +1068,7 @@ def serve_status( # TODO(tian): This is so slow... It will take ~10s to refresh the status # of controller. Can we optimize this? 
controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: @@ -1125,7 +1125,7 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, if isinstance(service_names, str): service_names = [service_names] cluster_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='All services should have terminated.') if handle is None or handle.head_ip is None: # The error message is already printed in @@ -1229,7 +1229,7 @@ def serve_tail_logs( raise ValueError('`replica_id` must be None when using ' 'target=CONTROLLER/LOAD_BALANCER.') controller_status, handle = backend_utils.is_controller_up( - controller_type=backend_utils.Controllers.SKY_SERVE_CONTROLLER, + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, stopped_message='No service is found.') if handle is None or handle.head_ip is None: msg = 'No service is found.' diff --git a/sky/execution.py b/sky/execution.py index 675a2902884..a9a1e0438c1 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -381,7 +381,7 @@ def _execute( backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: - controller = backend_utils.Controllers.from_name(cluster_name) + controller = controller_utils.Controllers.from_name(cluster_name) if controller is None and not _is_launched_by_sky_serve_controller: # UX: print live clusters to make users aware (to save costs). # @@ -511,8 +511,8 @@ def launch( if dryrun. 
""" entrypoint = task - backend_utils.check_cluster_name_not_controller(cluster_name, - operation_str='sky.launch') + controller_utils.check_cluster_name_not_controller( + cluster_name, operation_str='sky.launch') return _execute( entrypoint=entrypoint, @@ -603,8 +603,8 @@ def exec( # pylint: disable=redefined-builtin f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is ' 'deprecated. Pass sky.Task instead.' f'{colorama.Style.RESET_ALL}') - backend_utils.check_cluster_name_not_controller(cluster_name, - operation_str='sky.exec') + controller_utils.check_cluster_name_not_controller(cluster_name, + operation_str='sky.exec') handle = backend_utils.check_cluster_available( cluster_name, diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 48cbbb1c6a5..a8667e7f72b 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -1,11 +1,18 @@ """Util constants/functions for SkyPilot Controllers.""" +import dataclasses +import enum import os import typing from typing import Optional +import colorama + from sky import exceptions +from sky import serve as serve_lib from sky import sky_logging +from sky import spot as spot_lib from sky.utils import common_utils +from sky.utils import ux_utils if typing.TYPE_CHECKING: from sky.backends import cloud_vm_ray_backend @@ -17,6 +24,113 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 +@dataclasses.dataclass +class _ControllerSpec: + """Spec for skypilot controllers.""" + name: str + cluster_name: str + in_progress_hint: str + decline_cancel_hint: str + decline_down_in_init_status_hint: str + decline_down_for_dirty_controller_hint: str + check_cluster_name_hint: str + default_hint_if_non_existent: str + + +class Controllers(enum.Enum): + """Skypilot controllers.""" + # NOTE(dev): Keep this align with + # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE + SPOT_CONTROLLER = _ControllerSpec( + name='managed spot controller', + cluster_name=spot_lib.SPOT_CONTROLLER_NAME, + in_progress_hint=( + '* 
{job_info}To see all spot jobs: ' + f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the spot controller\'s jobs is not allowed.\nTo cancel ' + f'spot jobs, use: {colorama.Style.BRIGHT}sky spot cancel [--all]{colorama.Style.RESET_ALL}'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the spot controller while ' + 'it is in INIT state is not supported (this means a spot launch ' + 'is in progress or the previous launch failed), as we cannot ' + 'guarantee that all the spot jobs are finished. Please wait ' + 'until the spot controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' + f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' + f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), + check_cluster_name_hint=( + f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' + 'managed spot controller. '), + default_hint_if_non_existent='No managed spot jobs are found.') + SKY_SERVE_CONTROLLER = _ControllerSpec( + name='sky serve controller', + cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, + in_progress_hint=( + f'* To see detailed service status: {colorama.Style.BRIGHT}' + f'sky serve status -a{colorama.Style.RESET_ALL}'), + decline_cancel_hint=( + 'Cancelling the sky serve controller\'s jobs is not allowed.'), + decline_down_in_init_status_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller ' + 'while it is in INIT state is not supported (this means a sky ' + 'serve up is in progress or the previous launch failed), as we ' + 'cannot guarantee that all the services are terminated. 
Please ' + 'wait until the sky serve controller is UP or fix it with ' + f'{colorama.Style.BRIGHT}sky start ' + f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' + f'{colorama.Style.RESET_ALL}.'), + decline_down_for_dirty_controller_hint=( + f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' + 'supported, as it is currently serving the following services: ' + '{service_names}. Please terminate the services first with ' + f'{colorama.Style.BRIGHT}sky serve down -a' + f'{colorama.Style.RESET_ALL}.'), + check_cluster_name_hint=( + f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' + 'sky serve controller. '), + default_hint_if_non_existent='No service is found.') + + @classmethod + def from_name(cls, name: Optional[str]) -> Optional['Controllers']: + """Check if the cluster name is a controller name. + + Returns: + The controller if the cluster name is a controller name. + Otherwise, returns None. + """ + for controller in cls: + if controller.value.cluster_name == name: + return controller + return None + + +def check_cluster_name_not_controller( + cluster_name: Optional[str], + operation_str: Optional[str] = None) -> None: + """Errors out if the cluster name is a controller name. + + Raises: + sky.exceptions.NotSupportedError: if the cluster name is a controller + name, raise with an error message explaining 'operation_str' is not + allowed. + + Returns: + None, if the cluster name is not a controller name. + """ + controller = Controllers.from_name(cluster_name) + if controller is not None: + msg = controller.value.check_cluster_name_hint + if operation_str is not None: + msg += f' {operation_str} is not allowed.' 
+ with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError(msg) + + # Internal only: def download_and_stream_latest_job_log( backend: 'cloud_vm_ray_backend.CloudVmRayBackend', From e0049bf13704070adb7f24533a5f50d729d25ecc Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 21:23:10 -0800 Subject: [PATCH 205/223] fix wrong controller resources when controller exists --- sky/execution.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index a9a1e0438c1..c34b776ac58 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1083,9 +1083,12 @@ def serve_up( vars_to_fill, output_path=controller_file.name) controller_task = task_lib.Task.from_yaml(controller_file.name) - controller_cloud = (requested_resources.cloud - if controller_resources.cloud is None else - controller_resources.cloud) + controller_exist = ( + global_user_state.get_cluster_from_name(controller_name) + is not None) + controller_cloud = ( + requested_resources.cloud if not controller_exist and + controller_resources.cloud is None else controller_resources.cloud) # TODO(tian): Probably run another sky.launch after we get the load # balancer port from the controller? So we don't need to open so many # ports here. Or, we should have a nginx traffic control to refuse From f2eaabc689e1bcf14d5aaf9abbb4252a76d5a6c6 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sat, 11 Nov 2023 21:41:09 -0800 Subject: [PATCH 206/223] use return value to indicate success --- sky/serve/service.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sky/serve/service.py b/sky/serve/service.py index 1cd2bc18b22..617b73299c0 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -64,6 +64,14 @@ def _handle_signal(service_name: str) -> None: def _cleanup_storage(task_yaml: str) -> bool: + """Clean up the storage for the service. + + Args: + task_yaml: The task yaml file. 
+ + Returns: + True if the storage is cleaned up successfully, False otherwise. + """ try: task = task_lib.Task.from_yaml(task_yaml) backend = cloud_vm_ray_backend.CloudVmRayBackend() @@ -73,12 +81,12 @@ def _cleanup_storage(task_yaml: str) -> bool: f'{common_utils.format_exception(e)}') with ux_utils.enable_traceback(): logger.error(f' Traceback: {traceback.format_exc()}') - return True - return False + return False + return True def _cleanup(service_name: str, task_yaml: str) -> bool: - """Clean up the sky serve replicas, storage, and service record.""" + """Clean up all service related resources, i.e. replicas and storage.""" failed = False replica_infos = serve_state.get_replica_infos(service_name) info2proc: Dict[replica_managers.ReplicaInfo, @@ -108,7 +116,8 @@ def _cleanup(service_name: str, task_yaml: str) -> bool: info) failed = True logger.error(f'Replica {info.replica_id} failed to terminate.') - if _cleanup_storage(task_yaml): + success = _cleanup_storage(task_yaml) + if not success: failed = True return failed From 86f6d5fa81fc50ef7939a72808b2d08d808e12f3 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 12 Nov 2023 12:46:03 -0800 Subject: [PATCH 207/223] default controller resources & better error handling --- sky/cli.py | 8 +++ sky/execution.py | 57 +++++++++++++++------- sky/serve/constants.py | 21 +++----- sky/serve/replica_managers.py | 12 ++--- sky/serve/serve_state.py | 6 +-- sky/serve/serve_utils.py | 31 ++++++++---- sky/serve/service.py | 31 ++++-------- sky/templates/sky-serve-controller.yaml.j2 | 2 +- 8 files changed, 93 insertions(+), 75 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 491363303f1..70e2bd6d119 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4339,6 +4339,10 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): - User code failed. + - ``FAILED_CLEANUP``: Some error occurred while the service was being shut + down. This usually indicates resource leakages. 
If you see such status, + please login to the cloud console and double-check + Each replica can have one of the following statuses: - ``PENDING``: The maximum number of simultaneous launches has been reached @@ -4372,6 +4376,10 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): did not finish correctly. When seeing this status, please login to the cloud console and check whether there are some leaked VMs/resources. + - ``PREEMPTED``: The replica was preempted by the cloud provider and sky + serve is recovering this replica. This only happens when the replica is + a spot instance. + Examples: .. code-block:: bash diff --git a/sky/execution.py b/sky/execution.py index c34b776ac58..f1f9b0298f1 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1124,6 +1124,9 @@ def serve_up( retry_until_up=True, ) + style = colorama.Style + fore = colorama.Fore + assert controller_job_id is not None and controller_handle is not None # TODO(tian): Cache endpoint locally to speedup. Endpoint won't # change after the first time, so there is no consistency issue. @@ -1143,47 +1146,65 @@ def serve_up( code, require_outputs=True, stream_logs=False) + try: subprocess_utils.handle_returncode( returncode, code, 'Failed to wait for service initialization', lb_port_payload) - lb_port = serve.load_service_initialization_result(lb_port_payload) - if lb_port is None: + except exceptions.CommandError: + statuses = backend.get_job_status(controller_handle, + [controller_job_id], + stream_logs=False) + controller_job_status = list(statuses.values())[0] + if controller_job_status == sky.JobStatus.PENDING: + # Max number of services reached due to vCPU constraint. + # The controller job is pending due to ray job scheduling. + # We manually cancel the job here. + backend.cancel_jobs(controller_handle, [controller_job_id]) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Max number of services reached. 
' + 'To spin up more services, please ' + 'tear down some existing services.') from None + else: + # Possible cases: + # (1) name conflict; + # (2) max number of services reached due to memory + # constraint. The job will successfully run on the + # controller, but there will be an error thrown due + # to memory constraint check in the controller. + # See sky/serve/service.py for more details. with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'The service {service_name!r} is already running. ' - 'Please specify a different name for your service. ' - 'To update an existing service, run: `sky serve down` ' - 'and then `sky serve up` again (in-place update will ' - 'be supported in the future).') + 'Failed to spin up the service. Please ' + 'check the logs above for more details.') from None + else: + lb_port = serve.load_service_initialization_result(lb_port_payload) endpoint = f'{controller_handle.head_ip}:{lb_port}' - sn = service_name - style = colorama.Style - fore = colorama.Fore sky_logging.print( f'{fore.CYAN}Service name: ' - f'{style.BRIGHT}{sn}{style.RESET_ALL}' - f'\n{fore.CYAN}Endpoint URL: ' + f'{style.BRIGHT}{service_name}{style.RESET_ALL}' + f'{fore.CYAN}Endpoint URL: ' f'{style.BRIGHT}{endpoint}{style.RESET_ALL}' '\nTo see detailed info:\t\t' - f'{backend_utils.BOLD}sky serve status {sn} ' + f'{backend_utils.BOLD}sky serve status {service_name} ' f'[--endpoint]{backend_utils.RESET_BOLD}' '\nTo teardown the service:\t' - f'{backend_utils.BOLD}sky serve down {sn}' + f'{backend_utils.BOLD}sky serve down {service_name}' f'{backend_utils.RESET_BOLD}' '\n' '\nTo see logs of a replica:\t' - f'{backend_utils.BOLD}sky serve logs {sn} [REPLICA_ID]' + f'{backend_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]' f'{backend_utils.RESET_BOLD}' '\nTo see logs of load balancer:\t' - f'{backend_utils.BOLD}sky serve logs --load-balancer {sn}' + f'{backend_utils.BOLD}sky serve logs --load-balancer {service_name}' f'{backend_utils.RESET_BOLD}' 
'\nTo see logs of controller:\t' - f'{backend_utils.BOLD}sky serve logs --controller {sn}' + f'{backend_utils.BOLD}sky serve logs --controller {service_name}' f'{backend_utils.RESET_BOLD}' '\n' '\nTo monitor replica status:\t' - f'{backend_utils.BOLD}watch -n10 sky serve status {sn}' + f'{backend_utils.BOLD}watch -n10 sky serve status {service_name}' f'{backend_utils.RESET_BOLD}' '\nTo send a test request:\t\t' f'{backend_utils.BOLD}curl -L {endpoint}' diff --git a/sky/serve/constants.py b/sky/serve/constants.py index d99d7a95227..1b010b7f590 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -40,26 +40,17 @@ # scale up/down is within this cooldown time. AUTOSCALER_COOLDOWN_SECONDS = 60 -# The default controller resources. -# We need 200 GB disk space to enable using Azure as controller, since its image -# size is 150 GB. Also, we need 32 GB memory to run our controller and load -# balancer jobs since it is very memory demanding. +# The default controller resources. We need 200 GB disk space to enable using +# Azure as controller, since its default image size is 150 GB. # TODO(tian): We might need to be careful that service logs can take a lot of # disk space. Maybe we could use a larger disk size, migrate to cloud storage or # do some log rotation. -CONTROLLER_RESOURCES = {'disk_size': 200, 'memory': '32+'} +CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200} -# Our ray jobs is very memory demanding and number of services on a single -# controller is limited by memory. Rough benchmark result shows each service -# needs ~0.6 GB to run only for controller and load balancer process. -# Considering there will be some sky launch and sky down process on the fly, we -# set the memory usage to 1 GB to be safe. -# In this setup, a default highmem controller with 4 vCPU and 32 GB memory can -# run 32 services. -# TODO(tian): Since now we only have one job, we set this to 1 GB. Should do -# some benchmark to make sure this is safe. 
+# A default controller with 4 vCPU and 16 GB memory can run up to 16 services. +# TODO(tian): Stress test for 16 services on default controller resources. SERVICES_MEMORY_USAGE_GB = 1.0 -SERVICES_TASK_CPU_DEMAND = 0.125 +SERVICES_TASK_CPU_DEMAND = 0.25 # A period of time to initialize your service. Any readiness probe failures # during this period will be ignored. diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 11351cbdc72..3c94476ed5d 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -507,12 +507,12 @@ def _launch_replica(self, replica_id: int) -> None: ).run, args=(self._task_yaml_path, cluster_name), ) - # Don't start right now; we will start it later in _refresh_process_pool - # to avoid too many sky.launch running at the same time. - self._launch_process_pool[replica_id] = p replica_port = _get_resources_ports(self._task_yaml_path) info = ReplicaInfo(replica_id, cluster_name, replica_port) serve_state.add_or_update_replica(self._service_name, replica_id, info) + # Don't start right now; we will start it later in _refresh_process_pool + # to avoid too many sky.launch running at the same time. 
+ self._launch_process_pool[replica_id] = p def scale_up(self, n: int) -> None: for _ in range(n): @@ -576,17 +576,17 @@ def _download_and_stream_logs(info: ReplicaInfo): ).run, args=(info.cluster_name,), ) - p.start() - self._down_process_pool[replica_id] = p info.status_property.sky_down_status = ProcessStatus.RUNNING serve_state.add_or_update_replica(self._service_name, replica_id, info) + p.start() + self._down_process_pool[replica_id] = p def scale_down(self, replica_ids: List[int]) -> None: for replica_id in replica_ids: self._terminate_replica(replica_id, sync_down_logs=False) def _handle_preemption(self, replica_id: int) -> None: - logger.info(f'Beginning recovery for preempted replica {replica_id}.') + logger.info(f'Beginning handle for preempted replica {replica_id}.') # TODO(MaoZiming): Support spot recovery policies info = serve_state.get_replica_info_from_id(self._service_name, replica_id) diff --git a/sky/serve/serve_state.py b/sky/serve/serve_state.py index aad8f764168..f33351e02e0 100644 --- a/sky/serve/serve_state.py +++ b/sky/serve/serve_state.py @@ -108,9 +108,9 @@ def colored_str(self) -> str: ReplicaStatus.STARTING: colorama.Fore.CYAN, ReplicaStatus.READY: colorama.Fore.GREEN, ReplicaStatus.NOT_READY: colorama.Fore.YELLOW, - ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, ReplicaStatus.SHUTTING_DOWN: colorama.Fore.MAGENTA, ReplicaStatus.FAILED: colorama.Fore.RED, + ReplicaStatus.FAILED_CLEANUP: colorama.Fore.RED, ReplicaStatus.PREEMPTED: colorama.Fore.MAGENTA, ReplicaStatus.UNKNOWN: colorama.Fore.RED, } @@ -141,9 +141,6 @@ class ServiceStatus(enum.Enum): # Clean up failed FAILED_CLEANUP = 'FAILED_CLEANUP' - # Max service number is reached and the service is pending - PENDING = 'PENDING' - @classmethod def failed_statuses(cls) -> List['ServiceStatus']: return [cls.CONTROLLER_FAILED, cls.FAILED_CLEANUP] @@ -175,7 +172,6 @@ def from_replica_statuses( ServiceStatus.CONTROLLER_FAILED: colorama.Fore.RED, ServiceStatus.READY: colorama.Fore.GREEN, 
ServiceStatus.SHUTTING_DOWN: colorama.Fore.YELLOW, - ServiceStatus.PENDING: colorama.Fore.YELLOW, ServiceStatus.FAILED: colorama.Fore.RED, ServiceStatus.FAILED_CLEANUP: colorama.Fore.RED, } diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index b9a91447ebc..7ad84f05de7 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -25,6 +25,7 @@ from sky.serve import serve_state from sky.skylet import job_lib from sky.utils import common_utils +from sky.utils import ux_utils if typing.TYPE_CHECKING: import fastapi @@ -356,20 +357,32 @@ def wait_service_initialization(service_name: str, job_id: int) -> str: cnt = 0 while True: record = serve_state.get_service_from_name(service_name) - if record is None: - continue - if job_id != record['controller_job_id']: - return common_utils.encode_payload(None) - lb_port = record['load_balancer_port'] - if lb_port is not None: - return common_utils.encode_payload(lb_port) + if record is not None: + if job_id != record['controller_job_id']: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'The service {service_name!r} is already running. ' + 'Please specify a different name for your service. ' + 'To update an existing service, run: `sky serve down` ' + 'and then `sky serve up` again (in-place update will ' + 'be supported in the future).') + lb_port = record['load_balancer_port'] + if lb_port is not None: + return common_utils.encode_payload(lb_port) + elif len(serve_state.get_services()) >= NUM_SERVICE_THRESHOLD: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Max number of services reached. 
' + 'To spin up more services, please ' + 'tear down some existing services.') time.sleep(1) cnt += 1 if cnt > constants.INITIALIZATION_TIMEOUT_SECONDS: - raise ValueError(f'Failed to initialize service {service_name!r}.') + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Initialization of service {service_name!r} timeout.') -def load_service_initialization_result(payload: str) -> Optional[int]: +def load_service_initialization_result(payload: str) -> int: return common_utils.decode_payload(payload) diff --git a/sky/serve/service.py b/sky/serve/service.py index 617b73299c0..d81e1364037 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -136,17 +136,17 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): if isinstance(config, dict): resources_config = config.get('resources') requested_resources = resources.Resources.from_yaml_config(resources_config) - status = serve_state.ServiceStatus.CONTROLLER_INIT if len(serve_state.get_services()) >= serve_utils.NUM_SERVICE_THRESHOLD: - # TODO(tian): Probably we should raise an error and not pending here. - # This busy loop is also a ray job and will take a lot of memory. - status = serve_state.ServiceStatus.PENDING - success = serve_state.add_service(service_name, - controller_job_id=job_id, - policy=service_spec.policy_str(), - auto_restart=service_spec.auto_restart, - requested_resources=requested_resources, - status=status) + _cleanup_storage(tmp_task_yaml) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Max number of services reached.') + success = serve_state.add_service( + service_name, + controller_job_id=job_id, + policy=service_spec.policy_str(), + auto_restart=service_spec.auto_restart, + requested_resources=requested_resources, + status=serve_state.ServiceStatus.CONTROLLER_INIT) # Directly throw an error here. See sky/execution.py::serve_up # for more details. 
if not success: @@ -175,17 +175,6 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): controller_process = None load_balancer_process = None try: - # Wait until there is a service slot available. - while True: - _handle_signal(service_name) - # Use <= here since we already add this service to database. - if (len(serve_state.get_services()) <= - serve_utils.NUM_SERVICE_THRESHOLD): - serve_state.set_service_status( - service_name, serve_state.ServiceStatus.CONTROLLER_INIT) - break - time.sleep(1) - with filelock.FileLock( os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)): controller_port = common_utils.find_free_port( diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 5648c08516c..0c3378afc56 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -24,7 +24,7 @@ run: | --service-name {{service_name}} \ --task-yaml {{remote_task_yaml_path}} \ --job-id $SKYPILOT_INTERNAL_JOB_ID \ - > {{controller_log_file}} 2>&1 + >> {{controller_log_file}} 2>&1 envs: {%- for env_name, env_value in controller_envs.items() %} From 9b812d29190022101162e4923cf283e3d29b7a67 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 12 Nov 2023 15:19:50 -0800 Subject: [PATCH 208/223] stress test passed --- sky/execution.py | 2 +- sky/serve/constants.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index f1f9b0298f1..a09e4dc460b 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -1184,7 +1184,7 @@ def serve_up( sky_logging.print( f'{fore.CYAN}Service name: ' f'{style.BRIGHT}{service_name}{style.RESET_ALL}' - f'{fore.CYAN}Endpoint URL: ' + f'\n{fore.CYAN}Endpoint URL: ' f'{style.BRIGHT}{endpoint}{style.RESET_ALL}' '\nTo see detailed info:\t\t' f'{backend_utils.BOLD}sky serve status {service_name} ' diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 1b010b7f590..a3296b9e20f 100644 --- 
a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -48,7 +48,6 @@ CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200} # A default controller with 4 vCPU and 16 GB memory can run up to 16 services. -# TODO(tian): Stress test for 16 services on default controller resources. SERVICES_MEMORY_USAGE_GB = 1.0 SERVICES_TASK_CPU_DEMAND = 0.25 From 78ac1556ecd1a11538c44ca1f57d0f314606e275 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 13:40:25 -0800 Subject: [PATCH 209/223] nits --- sky/serve/serve_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 7ad84f05de7..34cf641c1eb 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -353,8 +353,11 @@ def wait_service_initialization(service_name: str, job_id: int) -> str: means another service is already taken that name. See sky/execution.py::serve_up for more details. (2) Wait for the load balancer port to be assigned and return. + + Returns: + Encoded load balancer port assigned to the service. """ - cnt = 0 + start_time = time.time() while True: record = serve_state.get_service_from_name(service_name) if record is not None: @@ -374,12 +377,11 @@ def wait_service_initialization(service_name: str, job_id: int) -> str: raise RuntimeError('Max number of services reached. 
' 'To spin up more services, please ' 'tear down some existing services.') - time.sleep(1) - cnt += 1 - if cnt > constants.INITIALIZATION_TIMEOUT_SECONDS: + if time.time() - start_time > constants.INITIALIZATION_TIMEOUT_SECONDS: with ux_utils.print_exception_no_traceback(): raise ValueError( f'Initialization of service {service_name!r} timeout.') + time.sleep(1) def load_service_initialization_result(payload: str) -> int: From 7a25afaffc002d79f99212589cfd9c5950207bed Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 15:57:37 -0800 Subject: [PATCH 210/223] fix examples --- examples/serve/gorilla/gorilla.yaml | 4 ++-- examples/serve/gorilla/use_gorilla.ipynb | 6 +++--- examples/serve/llama2/llama2.yaml | 2 +- examples/serve/vicuna-v1.5.yaml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/serve/gorilla/gorilla.yaml b/examples/serve/gorilla/gorilla.yaml index 8e01c713932..5a1239c5fb1 100644 --- a/examples/serve/gorilla/gorilla.yaml +++ b/examples/serve/gorilla/gorilla.yaml @@ -39,11 +39,11 @@ run: | sleep 10 echo 'Starting model worker...' python -u -m fastchat.serve.model_worker \ - --model-path gorilla-llm/gorilla-mpt-7b-hf-v0 2>&1 \ + --model-path gorilla-llm/gorilla-falcon-7b-hf-v0 2>&1 \ | tee model_worker.log & echo 'Waiting for model worker to start...' while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done - echo 'Starting openai api server server...' + echo 'Starting openai api server...' 
python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/examples/serve/gorilla/use_gorilla.ipynb b/examples/serve/gorilla/use_gorilla.ipynb index a7e37530d82..632ec2cd224 100644 --- a/examples/serve/gorilla/use_gorilla.ipynb +++ b/examples/serve/gorilla/use_gorilla.ipynb @@ -50,7 +50,7 @@ " print(f\"An exception has occurred: {e} \\nPlease raise an issue here: {issue_url}\")\n", "\n", "# Query Gorilla server\n", - "def get_gorilla_response(prompt=\"I would like to translate from English to French.\", model=\"gorilla-7b-hf-v1\"):\n", + "def get_gorilla_response(prompt=\"I would like to translate from English to French.\", model=\"gorilla-falcon-7b-hf-v0\"):\n", " try:\n", " completion = openai.ChatCompletion.create(\n", " model=model,\n", @@ -70,7 +70,7 @@ "# Gorilla `gorilla-mpt-7b-hf-v1` with code snippets\n", "# Translation\n", "prompt = \"I would like to translate 'I feel very good today.' from English to Chinese.\"\n", - "print(get_gorilla_response(prompt, model=\"gorilla-7b-hf-v1\"))" + "print(get_gorilla_response(prompt, model=\"gorilla-falcon-7b-hf-v0\"))" ] }, { @@ -82,7 +82,7 @@ "# Gorilla `gorilla-7b-hf-v1` with code snippets\n", "# Object Detection\n", "prompt = \"I want to build a robot that can detecting objects in an image ‘cat.jpeg’. Input: [‘cat.jpeg’]\"\n", - "print(get_gorilla_response(prompt, model=\"gorilla-7b-hf-v1\"))" + "print(get_gorilla_response(prompt, model=\"gorilla-falcon-7b-hf-v0\"))" ] } ], diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml index 78f0dc3a811..e318b25d5ac 100644 --- a/examples/serve/llama2/llama2.yaml +++ b/examples/serve/llama2/llama2.yaml @@ -53,5 +53,5 @@ run: | echo 'Waiting for model worker to start...' while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done - echo 'Starting openai api server server...' + echo 'Starting openai api server...' 
python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log diff --git a/examples/serve/vicuna-v1.5.yaml b/examples/serve/vicuna-v1.5.yaml index 689c38b9263..f53aebde747 100644 --- a/examples/serve/vicuna-v1.5.yaml +++ b/examples/serve/vicuna-v1.5.yaml @@ -44,5 +44,5 @@ run: | echo 'Waiting for model worker to start...' while ! `cat model_worker.log | grep -q 'Uvicorn running on'`; do sleep 1; done - echo 'Starting openai api server server...' + echo 'Starting openai api server...' python -u -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8087 | tee ~/openai_api_server.log From 8352affea654ff3617b1a552f1eeba53e1690e82 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 16:42:31 -0800 Subject: [PATCH 211/223] refactor: moving some funcs in execution.py to controller_utils --- sky/execution.py | 339 ++++------------------------------ sky/utils/controller_utils.py | 293 ++++++++++++++++++++++++++++- 2 files changed, 327 insertions(+), 305 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index a09e4dc460b..a5965bf9f75 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -4,11 +4,10 @@ """ import copy import enum -import getpass import os import re import tempfile -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import uuid import colorama @@ -21,13 +20,10 @@ from sky import optimizer from sky import serve from sky import sky_logging -from sky import skypilot_config from sky import spot from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp -from sky.data import data_utils -from sky.data import storage as storage_lib from sky.skylet import constants from sky.usage import usage_lib from sky.utils import common_utils @@ -63,14 +59,6 @@ sky.spot_launch(task, ...) """.strip() -# Message thrown when APIs sky.{spot_launch,serve_up}() received an invalid -# controller resources spec. 
-_CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( - '{controller_type} controller resources is not valid, please check ' - '~/.sky/config.yaml file and make sure ' - '{controller_type}.controller.resources is a valid resources spec. ' - 'Details:\n {err}') - def _convert_to_dag(entrypoint: Any) -> 'sky.Dag': """Convert the entrypoint to a sky.Dag. @@ -627,115 +615,6 @@ def exec( # pylint: disable=redefined-builtin ) -def _shared_controller_env_vars() -> Dict[str, str]: - env_vars: Dict[str, str] = { - env.value: '1' for env in env_options.Options if env.get() - } - env_vars.update({ - # Should not use $USER here, as that env var can be empty when - # running in a container. - constants.USER_ENV_VAR: getpass.getuser(), - constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), - # Skip cloud identity check to avoid the overhead. - env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1', - }) - return env_vars - - -def _controller_skypilot_config_setup( - controller_type: str, - controller_resources_config: Dict[str, Any], - remote_user_config_path: str, -) -> Tuple[Dict[str, Any], 'sky.Resources']: - """Read the skypilot config and setup the controller resources. - - Returns: - A tuple of (vars_to_fill, controller_resources). - """ - vars_to_fill: Dict[str, Any] = {} - controller_envs = _shared_controller_env_vars() - controller_resources_config_copied: Dict[str, Any] = copy.copy( - controller_resources_config) - if skypilot_config.loaded(): - # Look up the contents of the already loaded configs via the - # 'skypilot_config' module. Don't simply read the on-disk file as - # it may have changed since this process started. - # - # Set any proxy command to None, because the controller would've - # been launched behind the proxy, and in general any nodes we - # launch may not have or need the proxy setup. 
(If the controller - # needs to launch mew clusters in another region/VPC, the user - # should properly set up VPC peering, which will allow the - # cross-region/VPC communication. The proxy command is orthogonal - # to this scenario.) - # - # This file will be uploaded to the controller node and will be - # used throughout the spot job's / service's recovery attempts - # (i.e., if it relaunches due to preemption, we make sure the - # same config is used). - # - # NOTE: suppose that we have a controller in old VPC, then user - # changes 'vpc_name' in the config and does a 'spot launch' / - # 'serve up'. In general, the old controller may not successfully - # launch the job in the new VPC. This happens if the two VPCs don’t - # have peering set up. Like other places in the code, we assume - # properly setting up networking is user's responsibilities. - # TODO(zongheng): consider adding a basic check that checks - # controller VPC (or name) == the spot job's / service's VPC - # (or name). It may not be a sufficient check (as it's always - # possible that peering is not set up), but it may catch some - # obvious errors. - # TODO(zhwu): hacky. We should only set the proxy command of the - # cloud where the controller is launched (currently, only aws user - # uses proxy_command). - proxy_command_key = ('aws', 'ssh_proxy_command') - ssh_proxy_command = skypilot_config.get_nested(proxy_command_key, None) - config_dict = skypilot_config.to_dict() - if isinstance(ssh_proxy_command, str): - config_dict = skypilot_config.set_nested(proxy_command_key, None) - elif isinstance(ssh_proxy_command, dict): - # Instead of removing the key, we set the value to empty string - # so that the controller will only try the regions specified by - # the keys. 
- ssh_proxy_command = {k: None for k in ssh_proxy_command} - config_dict = skypilot_config.set_nested(proxy_command_key, - ssh_proxy_command) - - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmpfile: - common_utils.dump_yaml(tmpfile.name, config_dict) - controller_envs[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( - remote_user_config_path) - vars_to_fill.update({ - 'user_config_path': tmpfile.name, - 'remote_user_config_path': remote_user_config_path, - }) - - # Override the controller resources with the ones specified in the - # config. - custom_controller_resources_config = skypilot_config.get_nested( - (controller_type, 'controller', 'resources'), None) - if custom_controller_resources_config is not None: - controller_resources_config_copied.update( - custom_controller_resources_config) - else: - # If the user config is not loaded, manually set this to None - # so that the template won't render this. - vars_to_fill['user_config_path'] = None - try: - controller_resources = sky.Resources.from_yaml_config( - controller_resources_config_copied) - except ValueError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - _CONTROLLER_RESOURCES_NOT_VALID_MESSAGE.format( - controller_type=controller_type, - err=common_utils.format_exception(e, - use_bracket=True))) from e - - vars_to_fill['controller_envs'] = controller_envs - return vars_to_fill, controller_resources - - @usage_lib.entrypoint def spot_launch( task: Union['sky.Task', 'sky.Dag'], @@ -781,16 +660,28 @@ def spot_launch( dag_utils.fill_default_spot_config_in_dag_for_spot_launch(dag) for task_ in dag.tasks: - _maybe_translate_local_file_mounts_and_sync_up(task_, path='spot') + controller_utils.maybe_translate_local_file_mounts_and_sync_up( + task_, path='spot') with tempfile.NamedTemporaryFile(prefix=f'spot-dag-{dag.name}-', mode='w') as f: dag_utils.dump_chain_dag_to_yaml(dag, f.name) controller_name = spot.SPOT_CONTROLLER_NAME - extra_vars, controller_resources = 
_controller_skypilot_config_setup( - controller_type='spot', - controller_resources_config=spot.constants.CONTROLLER_RESOURCES, - remote_user_config_path=f'{dag.name}-{dag_uuid}.config_yaml') + extra_vars, controller_resources_config = ( + controller_utils.skypilot_config_setup( + controller_type='spot', + controller_resources_config=spot.constants.CONTROLLER_RESOURCES, + remote_user_config_path=f'{dag.name}-{dag_uuid}.config_yaml')) + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + controller_utils.CONTROLLER_RESOURCES_NOT_VALID_MESSAGE. + format(controller_type='spot', + err=common_utils.format_exception( + e, use_bracket=True))) from e vars_to_fill = { 'remote_user_yaml_prefix': spot.SPOT_TASK_YAML_PREFIX, 'user_yaml_path': f.name, @@ -837,178 +728,6 @@ def spot_launch( ) -def _maybe_translate_local_file_mounts_and_sync_up(task: task_lib.Task, - path: str): - """Translates local->VM mounts into Storage->VM, then syncs up any Storage. - - Eagerly syncing up local->Storage ensures Storage->VM would work at task - launch time. - - If there are no local source paths to be translated, this function would - still sync up any storage mounts with local source paths (which do not - undergo translation). - """ - # ================================================================ - # Translate the workdir and local file mounts to cloud file mounts. 
- # ================================================================ - run_id = common_utils.get_usage_run_id()[:8] - original_file_mounts = task.file_mounts if task.file_mounts else {} - original_storage_mounts = task.storage_mounts if task.storage_mounts else {} - - copy_mounts = task.get_local_to_remote_file_mounts() - if copy_mounts is None: - copy_mounts = {} - - has_local_source_paths_file_mounts = bool(copy_mounts) - has_local_source_paths_workdir = task.workdir is not None - - msg = None - if has_local_source_paths_workdir and has_local_source_paths_file_mounts: - msg = 'workdir and file_mounts with local source paths' - elif has_local_source_paths_file_mounts: - msg = 'file_mounts with local source paths' - elif has_local_source_paths_workdir: - msg = 'workdir' - if msg: - logger.info(f'{colorama.Fore.YELLOW}Translating {msg} to SkyPilot ' - f'Storage...{colorama.Style.RESET_ALL}') - - # Step 1: Translate the workdir to SkyPilot storage. - new_storage_mounts = {} - if task.workdir is not None: - bucket_name = constants.WORKDIR_BUCKET_NAME.format( - username=getpass.getuser(), id=run_id) - workdir = task.workdir - task.workdir = None - if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or - constants.SKY_REMOTE_WORKDIR in original_storage_mounts): - raise ValueError( - f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' - 'workdir and file_mounts contains it as the target.') - new_storage_mounts[ - constants. - SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': workdir, - 'persistent': False, - 'mode': 'COPY', - }) - # Check of the existence of the workdir in file_mounts is done in - # the task construction. - logger.info(f'Workdir {workdir!r} will be synced to cloud storage ' - f'{bucket_name!r}.') - - # Step 2: Translate the local file mounts with folder in src to SkyPilot - # storage. - # TODO(zhwu): Optimize this by: - # 1. Use the same bucket for all the mounts. - # 2. 
When the src is the same, use the same bucket. - copy_mounts_with_file_in_src = {} - for i, (dst, src) in enumerate(copy_mounts.items()): - assert task.file_mounts is not None - task.file_mounts.pop(dst) - if os.path.isfile(os.path.abspath(os.path.expanduser(src))): - copy_mounts_with_file_in_src[dst] = src - continue - bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( - username=getpass.getuser(), - id=f'{run_id}-{i}', - ) - new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({ - 'name': bucket_name, - 'source': src, - 'persistent': False, - 'mode': 'COPY', - }) - logger.info( - f'Folder in local file mount {src!r} will be synced to SkyPilot ' - f'storage {bucket_name}.') - - # Step 3: Translate local file mounts with file in src to SkyPilot storage. - # Hard link the files in src to a temporary directory, and upload folder. - local_fm_path = os.path.join( - tempfile.gettempdir(), - constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) - os.makedirs(local_fm_path, exist_ok=True) - file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format( - username=getpass.getuser(), id=run_id) - file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format( - path) - if copy_mounts_with_file_in_src: - src_to_file_id = {} - for i, src in enumerate(set(copy_mounts_with_file_in_src.values())): - src_to_file_id[src] = i - os.link(os.path.abspath(os.path.expanduser(src)), - os.path.join(local_fm_path, f'file-{i}')) - - new_storage_mounts[ - file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({ - 'name': file_bucket_name, - 'source': local_fm_path, - 'persistent': False, - 'mode': 'MOUNT', - }) - if file_mount_remote_tmp_dir in original_storage_mounts: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Failed to translate file mounts, due to the default ' - f'destination {file_mount_remote_tmp_dir} ' - 'being taken.') - sources = list(src_to_file_id.keys()) - sources_str = '\n\t'.join(sources) - 
logger.info('Source files in file_mounts will be synced to ' - f'cloud storage {file_bucket_name}:' - f'\n\t{sources_str}') - task.update_storage_mounts(new_storage_mounts) - - # Step 4: Upload storage from sources - # Upload the local source to a bucket. The task will not be executed - # locally, so we need to upload the files/folders to the bucket manually - # here before sending the task to the remote spot controller. - if task.storage_mounts: - # There may be existing (non-translated) storage mounts, so log this - # whenever task.storage_mounts is non-empty. - logger.info(f'{colorama.Fore.YELLOW}Uploading sources to cloud storage.' - f'{colorama.Style.RESET_ALL} See: sky storage ls') - task.sync_storage_mounts() - - # Step 5: Add the file download into the file mounts, such as - # /original-dst: s3://spot-fm-file-only-bucket-name/file-0 - new_file_mounts = {} - for dst, src in copy_mounts_with_file_in_src.items(): - storage = task.storage_mounts[file_mount_remote_tmp_dir] - store_type = list(storage.stores.keys())[0] - store_prefix = storage_lib.get_store_prefix(store_type) - bucket_url = store_prefix + file_bucket_name - file_id = src_to_file_id[src] - new_file_mounts[dst] = bucket_url + f'/file-{file_id}' - task.update_file_mounts(new_file_mounts) - - # Step 6: Replace the source field that is local path in all storage_mounts - # with bucket URI and remove the name field. - for storage_obj in task.storage_mounts.values(): - if (storage_obj.source is not None and - not data_utils.is_cloud_store_url(storage_obj.source)): - # Need to replace the local path with bucket URI, and remove the - # name field, so that the storage mount can work on the spot - # controller. 
- store_types = list(storage_obj.stores.keys()) - assert len(store_types) == 1, ( - 'We only support one store type for now.', storage_obj.stores) - store_type = store_types[0] - if store_type == storage_lib.StoreType.S3: - storage_obj.source = f's3://{storage_obj.name}' - elif store_type == storage_lib.StoreType.GCS: - storage_obj.source = f'gs://{storage_obj.name}' - elif store_type == storage_lib.StoreType.R2: - storage_obj.source = f'r2://{storage_obj.name}' - else: - with ux_utils.print_exception_no_traceback(): - raise exceptions.NotSupportedError( - f'Unsupported store type: {store_type}') - storage_obj.force_delete = True - - @usage_lib.entrypoint def serve_up( task: 'sky.Task', @@ -1048,7 +767,8 @@ def serve_up( 'Must only specify one port in resources. Each replica ' 'will use the port specified as application ingress port.') - _maybe_translate_local_file_mounts_and_sync_up(task, path='serve') + controller_utils.maybe_translate_local_file_mounts_and_sync_up(task, + path='serve') with tempfile.NamedTemporaryFile( prefix=f'service-task-{service_name}-', @@ -1066,10 +786,21 @@ def serve_up( serve.generate_remote_config_yaml_file_name(service_name)) controller_log_file = ( serve.generate_remote_controller_log_file_name(service_name)) - extra_vars, controller_resources = _controller_skypilot_config_setup( - controller_type='serve', - controller_resources_config=serve.CONTROLLER_RESOURCES, - remote_user_config_path=remote_config_yaml_path) + extra_vars, controller_resources_config = ( + controller_utils.skypilot_config_setup( + controller_type='serve', + controller_resources_config=serve.CONTROLLER_RESOURCES, + remote_user_config_path=remote_config_yaml_path)) + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + controller_utils.CONTROLLER_RESOURCES_NOT_VALID_MESSAGE. 
+ format(controller_type='serve', + err=common_utils.format_exception( + e, use_bracket=True))) from e vars_to_fill = { 'remote_task_yaml_path': remote_tmp_task_yaml_path, 'local_task_yaml_path': service_file.name, diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index a8667e7f72b..a3278365b95 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -1,20 +1,29 @@ """Util constants/functions for SkyPilot Controllers.""" +import copy import dataclasses import enum +import getpass import os +import tempfile import typing -from typing import Optional +from typing import Any, Dict, Optional, Tuple import colorama from sky import exceptions from sky import serve as serve_lib from sky import sky_logging +from sky import skypilot_config from sky import spot as spot_lib +from sky.data import data_utils +from sky.data import storage as storage_lib +from sky.skylet import constants from sky.utils import common_utils +from sky.utils import env_options from sky.utils import ux_utils if typing.TYPE_CHECKING: + from sky import task as task_lib from sky.backends import cloud_vm_ray_backend logger = sky_logging.init_logger(__name__) @@ -23,6 +32,14 @@ # controller and sky serve controller. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 +# Message thrown when APIs sky.{spot_launch,serve_up}() received an invalid +# controller resources spec. +CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( + '{controller_type} controller resources is not valid, please check ' + '~/.sky/config.yaml file and make sure ' + '{controller_type}.controller.resources is a valid resources spec. 
' + 'Details:\n {err}') + @dataclasses.dataclass class _ControllerSpec: @@ -171,3 +188,277 @@ def download_and_stream_latest_job_log( logger.error('Failed to find the logs for the user ' f'program at {log_file}.') return log_file + + +def _shared_controller_env_vars() -> Dict[str, str]: + env_vars: Dict[str, str] = { + env.value: '1' for env in env_options.Options if env.get() + } + env_vars.update({ + # Should not use $USER here, as that env var can be empty when + # running in a container. + constants.USER_ENV_VAR: getpass.getuser(), + constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), + # Skip cloud identity check to avoid the overhead. + env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1', + }) + return env_vars + + +def skypilot_config_setup( + controller_type: str, + controller_resources_config: Dict[str, Any], + remote_user_config_path: str, +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Read the skypilot config and setup the controller resources. + + Returns: + A tuple of (vars_to_fill, controller_resources_config). `vars_to_fill` + is a dict of variables that will be filled in the controller template. + The controller_resources_config is the resources config that will be + used to launch the controller. + """ + vars_to_fill: Dict[str, Any] = {} + controller_envs = _shared_controller_env_vars() + controller_resources_config_copied: Dict[str, Any] = copy.copy( + controller_resources_config) + if skypilot_config.loaded(): + # Look up the contents of the already loaded configs via the + # 'skypilot_config' module. Don't simply read the on-disk file as + # it may have changed since this process started. + # + # Set any proxy command to None, because the controller would've + # been launched behind the proxy, and in general any nodes we + # launch may not have or need the proxy setup.
(If the controller + # needs to launch new clusters in another region/VPC, the user + # should properly set up VPC peering, which will allow the + # cross-region/VPC communication. The proxy command is orthogonal + # to this scenario.) + # + # This file will be uploaded to the controller node and will be + # used throughout the spot job's / service's recovery attempts + # (i.e., if it relaunches due to preemption, we make sure the + # same config is used). + # + # NOTE: suppose that we have a controller in old VPC, then user + # changes 'vpc_name' in the config and does a 'spot launch' / + # 'serve up'. In general, the old controller may not successfully + # launch the job in the new VPC. This happens if the two VPCs don’t + # have peering set up. Like other places in the code, we assume + # properly setting up networking is the user's responsibility. + # TODO(zongheng): consider adding a basic check that checks + # controller VPC (or name) == the spot job's / service's VPC + # (or name). It may not be a sufficient check (as it's always + # possible that peering is not set up), but it may catch some + # obvious errors. + # TODO(zhwu): hacky. We should only set the proxy command of the + # cloud where the controller is launched (currently, only aws user + # uses proxy_command). + proxy_command_key = ('aws', 'ssh_proxy_command') + ssh_proxy_command = skypilot_config.get_nested(proxy_command_key, None) + config_dict = skypilot_config.to_dict() + if isinstance(ssh_proxy_command, str): + config_dict = skypilot_config.set_nested(proxy_command_key, None) + elif isinstance(ssh_proxy_command, dict): + # Instead of removing the key, we set the value to None + # so that the controller will only try the regions specified by + # the keys.
+ ssh_proxy_command = {k: None for k in ssh_proxy_command} + config_dict = skypilot_config.set_nested(proxy_command_key, + ssh_proxy_command) + + with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmpfile: + common_utils.dump_yaml(tmpfile.name, config_dict) + controller_envs[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = ( + remote_user_config_path) + vars_to_fill.update({ + 'user_config_path': tmpfile.name, + 'remote_user_config_path': remote_user_config_path, + }) + + # Override the controller resources with the ones specified in the + # config. + custom_controller_resources_config = skypilot_config.get_nested( + (controller_type, 'controller', 'resources'), None) + if custom_controller_resources_config is not None: + controller_resources_config_copied.update( + custom_controller_resources_config) + else: + # If the user config is not loaded, manually set this to None + # so that the template won't render this. + vars_to_fill['user_config_path'] = None + + vars_to_fill['controller_envs'] = controller_envs + return vars_to_fill, controller_resources_config_copied + + +def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', + path: str): + """Translates local->VM mounts into Storage->VM, then syncs up any Storage. + + Eagerly syncing up local->Storage ensures Storage->VM would work at task + launch time. + + If there are no local source paths to be translated, this function would + still sync up any storage mounts with local source paths (which do not + undergo translation). + """ + # ================================================================ + # Translate the workdir and local file mounts to cloud file mounts. 
+ # ================================================================ + run_id = common_utils.get_usage_run_id()[:8] + original_file_mounts = task.file_mounts if task.file_mounts else {} + original_storage_mounts = task.storage_mounts if task.storage_mounts else {} + + copy_mounts = task.get_local_to_remote_file_mounts() + if copy_mounts is None: + copy_mounts = {} + + has_local_source_paths_file_mounts = bool(copy_mounts) + has_local_source_paths_workdir = task.workdir is not None + + msg = None + if has_local_source_paths_workdir and has_local_source_paths_file_mounts: + msg = 'workdir and file_mounts with local source paths' + elif has_local_source_paths_file_mounts: + msg = 'file_mounts with local source paths' + elif has_local_source_paths_workdir: + msg = 'workdir' + if msg: + logger.info(f'{colorama.Fore.YELLOW}Translating {msg} to SkyPilot ' + f'Storage...{colorama.Style.RESET_ALL}') + + # Step 1: Translate the workdir to SkyPilot storage. + new_storage_mounts = {} + if task.workdir is not None: + bucket_name = constants.WORKDIR_BUCKET_NAME.format( + username=getpass.getuser(), id=run_id) + workdir = task.workdir + task.workdir = None + if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or + constants.SKY_REMOTE_WORKDIR in original_storage_mounts): + raise ValueError( + f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the ' + 'workdir and file_mounts contains it as the target.') + new_storage_mounts[ + constants. + SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({ + 'name': bucket_name, + 'source': workdir, + 'persistent': False, + 'mode': 'COPY', + }) + # Check of the existence of the workdir in file_mounts is done in + # the task construction. + logger.info(f'Workdir {workdir!r} will be synced to cloud storage ' + f'{bucket_name!r}.') + + # Step 2: Translate the local file mounts with folder in src to SkyPilot + # storage. + # TODO(zhwu): Optimize this by: + # 1. Use the same bucket for all the mounts. + # 2. 
When the src is the same, use the same bucket. + copy_mounts_with_file_in_src = {} + for i, (dst, src) in enumerate(copy_mounts.items()): + assert task.file_mounts is not None + task.file_mounts.pop(dst) + if os.path.isfile(os.path.abspath(os.path.expanduser(src))): + copy_mounts_with_file_in_src[dst] = src + continue + bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format( + username=getpass.getuser(), + id=f'{run_id}-{i}', + ) + new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({ + 'name': bucket_name, + 'source': src, + 'persistent': False, + 'mode': 'COPY', + }) + logger.info( + f'Folder in local file mount {src!r} will be synced to SkyPilot ' + f'storage {bucket_name}.') + + # Step 3: Translate local file mounts with file in src to SkyPilot storage. + # Hard link the files in src to a temporary directory, and upload folder. + local_fm_path = os.path.join( + tempfile.gettempdir(), + constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id)) + os.makedirs(local_fm_path, exist_ok=True) + file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format( + username=getpass.getuser(), id=run_id) + file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format( + path) + if copy_mounts_with_file_in_src: + src_to_file_id = {} + for i, src in enumerate(set(copy_mounts_with_file_in_src.values())): + src_to_file_id[src] = i + os.link(os.path.abspath(os.path.expanduser(src)), + os.path.join(local_fm_path, f'file-{i}')) + + new_storage_mounts[ + file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({ + 'name': file_bucket_name, + 'source': local_fm_path, + 'persistent': False, + 'mode': 'MOUNT', + }) + if file_mount_remote_tmp_dir in original_storage_mounts: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Failed to translate file mounts, due to the default ' + f'destination {file_mount_remote_tmp_dir} ' + 'being taken.') + sources = list(src_to_file_id.keys()) + sources_str = '\n\t'.join(sources) + 
logger.info('Source files in file_mounts will be synced to ' + f'cloud storage {file_bucket_name}:' + f'\n\t{sources_str}') + task.update_storage_mounts(new_storage_mounts) + + # Step 4: Upload storage from sources + # Upload the local source to a bucket. The task will not be executed + # locally, so we need to upload the files/folders to the bucket manually + # here before sending the task to the remote spot controller. + if task.storage_mounts: + # There may be existing (non-translated) storage mounts, so log this + # whenever task.storage_mounts is non-empty. + logger.info(f'{colorama.Fore.YELLOW}Uploading sources to cloud storage.' + f'{colorama.Style.RESET_ALL} See: sky storage ls') + task.sync_storage_mounts() + + # Step 5: Add the file download into the file mounts, such as + # /original-dst: s3://spot-fm-file-only-bucket-name/file-0 + new_file_mounts = {} + for dst, src in copy_mounts_with_file_in_src.items(): + storage = task.storage_mounts[file_mount_remote_tmp_dir] + store_type = list(storage.stores.keys())[0] + store_prefix = storage_lib.get_store_prefix(store_type) + bucket_url = store_prefix + file_bucket_name + file_id = src_to_file_id[src] + new_file_mounts[dst] = bucket_url + f'/file-{file_id}' + task.update_file_mounts(new_file_mounts) + + # Step 6: Replace the source field that is local path in all storage_mounts + # with bucket URI and remove the name field. + for storage_obj in task.storage_mounts.values(): + if (storage_obj.source is not None and + not data_utils.is_cloud_store_url(storage_obj.source)): + # Need to replace the local path with bucket URI, and remove the + # name field, so that the storage mount can work on the spot + # controller. 
+ store_types = list(storage_obj.stores.keys()) + assert len(store_types) == 1, ( + 'We only support one store type for now.', storage_obj.stores) + store_type = store_types[0] + if store_type == storage_lib.StoreType.S3: + storage_obj.source = f's3://{storage_obj.name}' + elif store_type == storage_lib.StoreType.GCS: + storage_obj.source = f'gs://{storage_obj.name}' + elif store_type == storage_lib.StoreType.R2: + storage_obj.source = f'r2://{storage_obj.name}' + else: + with ux_utils.print_exception_no_traceback(): + raise exceptions.NotSupportedError( + f'Unsupported store type: {store_type}') + storage_obj.force_delete = True From 0c8b734e92b9fd61522bf6cadad18b52a6505cee Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 21:32:55 -0800 Subject: [PATCH 212/223] smoke test passed --- sky/backends/cloud_vm_ray_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index f74648669e9..fdb41181b1f 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4917,7 +4917,7 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, env_vars=task.envs, task_name=task.name, job_run_id=job_run_id, - ray_resources_dict=resources_dict, + ray_resources_dict=backend_utils.get_task_demands_dict(task), log_dir=log_dir, gang_scheduling_id=i, use_sudo=use_sudo, From 9c67cd088b9ddba707c82976b2f8bb8c85f900a2 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 22:19:08 -0800 Subject: [PATCH 213/223] remove --target & minor --- sky/cli.py | 42 ++++------------------------ sky/serve/load_balancing_policies.py | 10 ++++--- sky/serve/replica_managers.py | 2 +- sky/serve/serve_utils.py | 7 +---- 4 files changed, 13 insertions(+), 48 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 00d016fe8c2..556d9c49c3d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4243,7 +4243,6 @@ def serve_up( sky serve up service.yaml """ - # 
TODO(tian): Update the example after we move the ports to resources. if service_name is None: service_name = serve_lib.generate_service_name() @@ -4441,7 +4440,6 @@ def serve_down(service_names: List[str], all: bool, yes: bool): \b # Tear down all existing services. sky serve down -a - """ if sum([len(service_names) > 0, all]) != 1: argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( @@ -4488,12 +4486,6 @@ def serve_down(service_names: List[str], all: bool, yes: bool): default=False, required=False, help='Show the load balancer logs of this service.') -@click.option('--target', - default=None, - type=click.Choice(['controller', 'load-balancer', 'replica'], - case_sensitive=False), - required=False, - help='Target to stream logs.') @click.argument('service_name', required=True, type=str) @click.argument('replica_id', required=False, type=int) @usage_lib.entrypoint @@ -4503,7 +4495,6 @@ def serve_logs( controller: bool, load_balancer: bool, replica_id: Optional[int], - target: Optional[str], ): """Tail the log of a service. @@ -4519,45 +4510,22 @@ def serve_logs( \b # Tail the logs of replica 1 sky serve logs [SERVICE_ID] 1 - \b - # Specify target to stream logs by `--target` is also supported - sky serve logs --target controller [SERVICE_ID] - sky serve logs --target load-balancer [SERVICE_ID] - sky serve logs --target replica [SERVICE_ID] 1 - \b - # If both --target and --controller/--load-balancer are specified, - # --controller/--load-balancer takes precedence. 
- # Tail the controller logs of a service: - sky serve logs --controller --target load-balancer [SERVICE_ID] """ have_replica_id = replica_id is not None num_flags = (controller + load_balancer + have_replica_id) if num_flags > 1: raise click.UsageError('At most one of --controller, --load-balancer, ' '[REPLICA_ID] can be specified.') - if num_flags == 0 and target is None: - raise click.UsageError( - 'One of --controller, --load-balancer, [REPLICA_ID] or --target ' - 'must be specified.') + if num_flags == 0: + raise click.UsageError('One of --controller, --load-balancer, ' + '[REPLICA_ID] must be specified.') if controller: - if target is not None: - click.secho(f'Overriding --target={target} with --controller.', - fg='yellow') target_component = serve_lib.ServiceComponent.CONTROLLER elif load_balancer: - if target is not None: - click.secho(f'Overriding --target={target} with --load-balancer.', - fg='yellow') target_component = serve_lib.ServiceComponent.LOAD_BALANCER - elif target is not None: - # Change load-balancer to load_balancer to match the enum. - target = target.replace('-', '_') - target_component = serve_lib.ServiceComponent(target) - if (target_component == serve_lib.ServiceComponent.REPLICA and - not have_replica_id): - raise click.UsageError( - 'REPLICA_ID must be specified when using --target replica.') else: + # Already checked that num_flags == 1. 
+ assert replica_id is not None target_component = serve_lib.ServiceComponent.REPLICA try: core.serve_tail_logs(service_name, diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py index 6bf9e0e44a2..c8c9aa07765 100644 --- a/sky/serve/load_balancing_policies.py +++ b/sky/serve/load_balancing_policies.py @@ -1,11 +1,13 @@ """LoadBalancingPolicy: Policy to select endpoint.""" import random +import typing from typing import List, Optional -import fastapi - from sky import sky_logging +if typing.TYPE_CHECKING: + import fastapi + logger = sky_logging.init_logger(__name__) @@ -20,7 +22,7 @@ def set_ready_replicas(self, ready_replicas: List[str]) -> None: # TODO(tian): We should have an abstract class for Request to # compatible with all frameworks. - def select_replica(self, request: fastapi.Request) -> Optional[str]: + def select_replica(self, request: 'fastapi.Request') -> Optional[str]: raise NotImplementedError @@ -40,7 +42,7 @@ def set_ready_replicas(self, ready_replicas: List[str]) -> None: self.ready_replicas = ready_replicas self.index = 0 - def select_replica(self, request: fastapi.Request) -> Optional[str]: + def select_replica(self, request: 'fastapi.Request') -> Optional[str]: if not self.ready_replicas: return None ready_replica_url = self.ready_replicas[self.index] diff --git a/sky/serve/replica_managers.py b/sky/serve/replica_managers.py index 3c94476ed5d..fc11b0d3110 100644 --- a/sky/serve/replica_managers.py +++ b/sky/serve/replica_managers.py @@ -381,7 +381,7 @@ def probe( Tuple of (self, is_ready, probe_time). """ replica_identity = f'replica {self.replica_id} with url {self.url}' - # # TODO(tian): This requiring the clock on each replica to be aligned, + # TODO(tian): This requiring the clock on each replica to be aligned, # which may not be true when the GCP VMs have run for a long time. We # should have a better way to do this. See #2539 for more information. 
probe_time = time.time() diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 34cf641c1eb..01faecd95bf 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -247,12 +247,7 @@ def update_service_status() -> None: # Skip services that is shutting down. continue controller_job_id = record['controller_job_id'] - if controller_job_id is None: - # The service just registered and the controller job is not - # scheduled yet. - # TODO(tian): Remove this once we merge #2736 and not register - # service previous than the controller job. - continue + assert controller_job_id is not None controller_status = job_lib.get_status(controller_job_id) if controller_status is None or controller_status.is_terminal(): # If controller job is not running, set it as controller failed. From fa20443aa0a6140e046895da5723b1c1ecafc458 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 13 Nov 2023 23:33:02 -0800 Subject: [PATCH 214/223] teardown failed services with --purge flag --- sky/cli.py | 12 +++- sky/core.py | 19 ++++-- sky/serve/__init__.py | 2 +- sky/serve/serve_utils.py | 142 +++++++++++++++++++++++++++------------ 4 files changed, 123 insertions(+), 52 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 556d9c49c3d..c2f7cef5943 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4409,6 +4409,11 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): default=False, is_flag=True, help='Tear down all services.') +@click.option('--purge', + '-p', + default=False, + is_flag=True, + help='Tear down services in failed status.') @click.option('--yes', '-y', is_flag=True, @@ -4416,7 +4421,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): required=False, help='Skip confirmation prompt.') # pylint: disable=redefined-builtin -def serve_down(service_names: List[str], all: bool, yes: bool): +def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): """Teardown service(s). 
SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If @@ -4440,6 +4445,9 @@ def serve_down(service_names: List[str], all: bool, yes: bool): \b # Tear down all existing services. sky serve down -a + \b + # Forcefully tear down a service in failed status. + sky serve down failed-service --purge """ if sum([len(service_names) > 0, all]) != 1: argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( @@ -4466,7 +4474,7 @@ def serve_down(service_names: List[str], all: bool, yes: bool): abort=True, show_default=True) - sky.serve_down(service_names=service_names, all=all) + sky.serve_down(service_names=service_names, all=all, purge=purge) @serve.command('logs', cls=_DocumentedCodeCommand) diff --git a/sky/core.py b/sky/core.py index f9e38338604..e66ade01a19 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1036,6 +1036,7 @@ def serve_status( 'replica_id': (int) replica id, 'name': (str) replica name, 'status': (sky.serve.ReplicaStatus) replica status, + 'launched_at': (int) timestamp of launched, 'handle': (ResourceHandle) handle of the replica cluster, } @@ -1101,19 +1102,25 @@ def serve_status( except exceptions.CommandError as e: raise RuntimeError(e.error_msg) from e - return serve.load_serve_status(serve_status_payload) + return serve.load_service_status(serve_status_payload) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def serve_down(service_names: Optional[Union[str, List[str]]] = None, - all: bool = False) -> None: +def serve_down( + service_names: Optional[Union[str, List[str]]] = None, + all: bool = False, + purge: bool = False, +) -> None: """Teardown a service. Please refer to the sky.cli.serve_down for the docs. Args: service_names: Name of the service(s). + all: Whether to terminate all services. + purge: Whether to terminate services in a failed status. These services + may potentially lead to resource leaks. Raises: sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. 
@@ -1145,10 +1152,8 @@ def serve_down(service_names: Optional[Union[str, List[str]]] = None, backend = backend_utils.get_backend_from_handle(handle) assert isinstance(backend, backends.CloudVmRayBackend) - if all: - code = serve.ServeCodeGen.terminate_services(None) - else: - code = serve.ServeCodeGen.terminate_services(service_names) + service_names = None if all else service_names + code = serve.ServeCodeGen.terminate_services(service_names, purge) try: returncode, stdout, _ = backend.run_on_head(handle, diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index e25d24a079e..920d7341913 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -15,8 +15,8 @@ from sky.serve.serve_utils import generate_remote_tmp_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import generate_service_name -from sky.serve.serve_utils import load_serve_status from sky.serve.serve_utils import load_service_initialization_result +from sky.serve.serve_utils import load_service_status from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 01faecd95bf..edaf9979354 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -6,6 +6,7 @@ import pickle import re import shlex +import shutil import threading import time import typing @@ -58,7 +59,7 @@ class UserSignal(enum.Enum): # Stop the controller, load balancer and all replicas. TERMINATE = 'terminate' - # TODO(tian): Add more signals, such as update or pause. + # TODO(tian): Add more signals, such as pause. 
def error_type(self) -> Type[Exception]: """Get the error corresponding to the signal.""" @@ -280,63 +281,119 @@ def _get_service_status( def get_service_status_encoded(service_names: Optional[List[str]]) -> str: - serve_statuses = [] + service_statuses = [] if service_names is None: # Get all service names service_names = serve_state.get_glob_service_names(None) for service_name in service_names: - serve_status = _get_service_status(service_name) - if serve_status is None: + service_status = _get_service_status(service_name) + if service_status is None: continue - serve_statuses.append({ + service_statuses.append({ k: base64.b64encode(pickle.dumps(v)).decode('utf-8') - for k, v in serve_status.items() + for k, v in service_status.items() }) - return common_utils.encode_payload(serve_statuses) + return common_utils.encode_payload(service_statuses) -def load_serve_status(payload: str) -> List[Dict[str, Any]]: - serve_statuses_encoded = common_utils.decode_payload(payload) - serve_statuses = [] - for serve_status in serve_statuses_encoded: - serve_statuses.append({ +def load_service_status(payload: str) -> List[Dict[str, Any]]: + service_statuses_encoded = common_utils.decode_payload(payload) + service_statuses = [] + for service_status in service_statuses_encoded: + service_statuses.append({ k: pickle.loads(base64.b64decode(v)) - for k, v in serve_status.items() + for k, v in service_status.items() }) - return serve_statuses + return service_statuses -def terminate_services(service_names: Optional[List[str]]) -> str: +def _terminate_failed_services( + service_name: str, + service_status: serve_state.ServiceStatus) -> Optional[str]: + """Terminate service in failed status. + + Services included in ServiceStatus.failed_statuses() do not have an + active controller process, so we can't send a file terminate signal + to the controller. Instead, we manually cleanup database record for + the service and alert the user about a potential resource leak. 
+ + Returns: + A message indicating potential resource leak (if any). If no + resource leak is detected, return None. + """ + remaining_replica_clusters = [] + # The controller should have already attempted to terminate those + # replicas, so we don't need to try again here. + for replica_info in serve_state.get_replica_infos(service_name): + # TODO(tian): Refresh latest status of the cluster. + if global_user_state.get_cluster_from_name( + replica_info.cluster_name) is not None: + remaining_replica_clusters.append(f'{replica_info.cluster_name!r}') + serve_state.remove_replica(service_name, replica_info.replica_id) + + service_dir = os.path.expanduser( + generate_remote_service_dir_name(service_name)) + shutil.rmtree(service_dir) + serve_state.remove_service(service_name) + + if not remaining_replica_clusters: + return None + remaining_identity = ', '.join(remaining_replica_clusters) + return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with ' + f'failed status ({service_status}). This may indicate a resource ' + 'leak. Please check the following SkyPilot clusters on the ' + f'controller: {remaining_identity}{colorama.Style.RESET_ALL}') + + +def terminate_services(service_names: Optional[List[str]], purge: bool) -> str: service_names = serve_state.get_glob_service_names(service_names) terminated_service_names = [] + messages = [] for service_name in service_names: - serve_status = _get_service_status(service_name, - with_replica_info=False) - assert serve_status is not None - if (serve_status['status'] - in serve_state.ServiceStatus.refuse_to_terminate_statuses()): - # TODO(tian): Cleanup replicas for CONTROLLER_FAILED status. Seems - # like spot doesn't implement this yet? + service_status = _get_service_status(service_name, + with_replica_info=False) + assert service_status is not None + if service_status['status'] == serve_state.ServiceStatus.SHUTTING_DOWN: + # Already scheduled to be terminated. 
continue - # Send the terminate signal to controller. - signal_file = pathlib.Path( - constants.SIGNAL_FILE_PATH.format(service_name)) - # Filelock is needed to prevent race condition between signal - # check/removal and signal writing. - with filelock.FileLock(str(signal_file) + '.lock'): - with signal_file.open(mode='w') as f: - # TODO(tian): Probably write a dict instead of bare string - # to the file? It will be helpful for update cases. - f.write(UserSignal.TERMINATE.value) - f.flush() + if (service_status['status'] + in serve_state.ServiceStatus.failed_statuses()): + if purge: + message = _terminate_failed_services(service_name, + service_status['status']) + if message is not None: + messages.append(message) + else: + messages.append( + f'{colorama.Fore.YELLOW}Service {service_name!r} is in ' + f'failed status ({service_status["status"]}). Skipping ' + 'its termination as it could lead to a resource leak. ' + f'(Use `sky serve down {service_name} --purge` to ' + 'forcefully terminate the service.)' + f'{colorama.Style.RESET_ALL}') + # Don't add to terminated_service_names since it's not + # actually terminated. + continue + else: + # Send the terminate signal to controller. + signal_file = pathlib.Path( + constants.SIGNAL_FILE_PATH.format(service_name)) + # Filelock is needed to prevent race condition between signal + # check/removal and signal writing. + with filelock.FileLock(str(signal_file) + '.lock'): + with signal_file.open(mode='w') as f: + f.write(UserSignal.TERMINATE.value) + f.flush() terminated_service_names.append(f'{service_name!r}') if len(terminated_service_names) == 0: - return 'No service to terminate.' - identity_str = f'Service {terminated_service_names[0]} is' - if len(terminated_service_names) > 1: - terminated_service_names_str = ', '.join(terminated_service_names) - identity_str = f'Services {terminated_service_names_str} are' - return f'{identity_str} scheduled to be terminated.' 
+ messages.append('No service to terminate.') + else: + identity_str = f'Service {terminated_service_names[0]} is' + if len(terminated_service_names) > 1: + terminated_service_names_str = ', '.join(terminated_service_names) + identity_str = f'Services {terminated_service_names_str} are' + messages.append(f'{identity_str} scheduled to be terminated.') + return '\n'.join(messages) def wait_service_initialization(service_name: str, job_id: int) -> str: @@ -588,10 +645,11 @@ def get_service_status(cls, service_names: Optional[List[str]]) -> str: return cls._build(code) @classmethod - def terminate_services(cls, service_names: Optional[List[str]]) -> str: + def terminate_services(cls, service_names: Optional[List[str]], + purge: bool) -> str: code = [ - f'msg = serve_utils.terminate_services({service_names!r})', - 'print(msg, end="", flush=True)' + f'msg = serve_utils.terminate_services({service_names!r}, ' + f'purge={purge})', 'print(msg, end="", flush=True)' ] return cls._build(code) From 779383376824e7fdee3be66d76506e4371cf88bd Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 10:23:51 -0800 Subject: [PATCH 215/223] move core api to sky/serve/api.py --- sky/__init__.py | 5 - sky/cli.py | 16 +- sky/core.py | 257 ----------------- sky/execution.py | 231 +-------------- sky/serve/__init__.py | 12 +- sky/serve/api.py | 510 ++++++++++++++++++++++++++++++++++ sky/serve/serve_utils.py | 4 +- sky/serve/service.py | 2 +- sky/utils/controller_utils.py | 2 +- 9 files changed, 530 insertions(+), 509 deletions(-) create mode 100644 sky/serve/api.py diff --git a/sky/__init__.py b/sky/__init__.py index 38f9ae36a12..b27de4a5c3f 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -49,7 +49,6 @@ def get_git_commit(): from sky.core import download_logs from sky.core import job_status from sky.core import queue -from sky.core import serve_down from sky.core import spot_cancel from sky.core import spot_queue from sky.core import spot_status @@ -65,7 +64,6 @@ def 
get_git_commit(): from sky.data import StoreType from sky.execution import exec # pylint: disable=redefined-builtin from sky.execution import launch -from sky.execution import serve_up from sky.execution import spot_launch from sky.optimizer import Optimizer from sky.optimizer import OptimizeTarget @@ -116,7 +114,6 @@ def get_git_commit(): 'launch', 'exec', 'spot_launch', - 'serve_up', # core APIs 'status', 'start', @@ -137,6 +134,4 @@ def get_git_commit(): # core APIs Storage Management 'storage_ls', 'storage_delete', - # core APIs Serve Management - 'serve_down', ] diff --git a/sky/cli.py b/sky/cli.py index c2f7cef5943..2aaf75c4212 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1692,7 +1692,7 @@ def _get_services(service_names: Optional[List[str]], if not service_names: # Change empty list to None service_names = None - service_records = core.serve_status(service_names) + service_records = sky.serve.status(service_names) num_services = len(service_records) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status @@ -2863,7 +2863,7 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): with rich_utils.safe_status( '[bold cyan]Checking for running services[/]'): try: - services = core.serve_status() + services = sky.serve.status() except exceptions.ClusterNotUpError: cluster_status = backend_utils.refresh_cluster_status_handle( controller_name) @@ -4282,7 +4282,7 @@ def serve_up( if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve_up(task, service_name) + sky.serve.up(task, service_name) @serve.command('status', cls=_DocumentedCodeCommand) @@ -4474,7 +4474,7 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): abort=True, show_default=True) - sky.serve_down(service_names=service_names, all=all, purge=purge) + sky.serve.down(service_names=service_names, all=all, purge=purge) @serve.command('logs', cls=_DocumentedCodeCommand) @@ -4536,10 +4536,10 @@ 
def serve_logs( assert replica_id is not None target_component = serve_lib.ServiceComponent.REPLICA try: - core.serve_tail_logs(service_name, - target=target_component, - replica_id=replica_id, - follow=follow) + sky.serve.tail_logs(service_name, + target=target_component, + replica_id=replica_id, + follow=follow) except exceptions.ClusterNotUpError: # Hint messages already printed by the call above. sys.exit(1) diff --git a/sky/core.py b/sky/core.py index e66ade01a19..2488320c058 100644 --- a/sky/core.py +++ b/sky/core.py @@ -11,7 +11,6 @@ from sky import data from sky import exceptions from sky import global_user_state -from sky import serve from sky import sky_logging from sky import spot from sky import status_lib @@ -993,259 +992,3 @@ def storage_delete(name: str) -> None: source=handle.source, sync_on_reconstruction=False) store_object.delete() - - -# ====================== -# = Service Management = -# ====================== - - -@usage_lib.entrypoint -def serve_status( - service_names: Optional[Union[str, - List[str]]] = None) -> List[Dict[str, Any]]: - """Get service statuses. - - If service_names is given, return those services. Otherwise, return all - services. - - Each returned value has the following fields: - - .. code-block:: python - - { - 'name': (str) service name, - 'controller_job_id': (int) the job id of the controller, - 'uptime': (int) uptime in seconds, - 'status': (sky.ServiceStatus) service status, - 'controller_port': (Optional[int]) controller port, - 'load_balancer_port': (Optional[int]) load balancer port, - 'policy': (Optional[str]) load balancer policy description, - 'auto_restart': (bool) whether the service replica will be - auto-restarted, - 'requested_resources': (sky.Resources) requested resources - for replica, - 'replica_info': (List[Dict[str, Any]]) replica information, - } - - Each entry in replica_info has the following fields: - - .. 
code-block:: python - - { - 'replica_id': (int) replica id, - 'name': (str) replica name, - 'status': (sky.serve.ReplicaStatus) replica status, - 'launched_at': (int) timestamp of launched, - 'handle': (ResourceHandle) handle of the replica cluster, - } - - For possible service statuses and replica statuses, please refer to - sky.cli.serve_status. - - Args: - service_names: a single or a list of service names to query. If None, - query all services. - - Returns: - A list of dicts, with each dict containing the information of a service. - If a service is not found, it will be omitted from the returned list. - - Raises: - RuntimeError: if failed to get the service status. - exceptions.ClusterNotUpError: if the sky serve controller is not up. - """ - if service_names is not None: - if isinstance(service_names, str): - service_names = [service_names] - - try: - backend_utils.check_network_connection() - except exceptions.NetworkError as e: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Failed to refresh service status due to network error.') from e - - # TODO(tian): This is so slow... It will take ~10s to refresh the status - # of controller. Can we optimize this? - controller_status, handle = backend_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message='No service is found.') - - if handle is None or handle.head_ip is None: - # When the controller is STOPPED, the head_ip will be None, as - # it will be set in global_user_state.remove_cluster(). - # We do not directly check for UP because the controller may be - # in INIT state during another `sky serve up`, but still have - # head_ip available. In this case, we can still try to ssh - # into the controller and fetch the job table. 
- raise exceptions.ClusterNotUpError('Sky serve controller is not up.', - cluster_status=controller_status) - - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - - code = serve.ServeCodeGen.get_service_status(service_names) - returncode, serve_status_payload, stderr = backend.run_on_head( - handle, - code, - require_outputs=True, - stream_logs=False, - separate_stderr=True) - - try: - subprocess_utils.handle_returncode(returncode, - code, - 'Failed to fetch services', - stderr, - stream_logs=True) - except exceptions.CommandError as e: - raise RuntimeError(e.error_msg) from e - - return serve.load_service_status(serve_status_payload) - - -@usage_lib.entrypoint -# pylint: disable=redefined-builtin -def serve_down( - service_names: Optional[Union[str, List[str]]] = None, - all: bool = False, - purge: bool = False, -) -> None: - """Teardown a service. - - Please refer to the sky.cli.serve_down for the docs. - - Args: - service_names: Name of the service(s). - all: Whether to terminate all services. - purge: Whether to terminate services in a failed status. These services - may potentially lead to resource leaks. - - Raises: - sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. - ValueError: if the arguments are invalid. - RuntimeError: if failed to terminate the service. - """ - if service_names is None: - service_names = [] - if isinstance(service_names, str): - service_names = [service_names] - cluster_status, handle = backend_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message='All services should have terminated.') - if handle is None or handle.head_ip is None: - # The error message is already printed in - # backend_utils.is_controller_up - # TODO(zhwu): Move the error message into the exception. 
- with ux_utils.print_exception_no_traceback(): - raise exceptions.ClusterNotUpError(message='', - cluster_status=cluster_status) - - service_names_str = ','.join(service_names) - if sum([len(service_names) > 0, all]) != 1: - argument_str = f'service_names={service_names_str}' if len( - service_names) > 0 else '' - argument_str += ' all' if all else '' - raise ValueError('Can only specify one of service_names or all. ' - f'Provided {argument_str!r}.') - - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend) - service_names = None if all else service_names - code = serve.ServeCodeGen.terminate_services(service_names, purge) - - try: - returncode, stdout, _ = backend.run_on_head(handle, - code, - require_outputs=True, - stream_logs=False) - except exceptions.FetchIPError as e: - raise RuntimeError( - 'Failed to fetch controller IP. Please refresh controller ' - f'status by `sky status -r {serve.SKY_SERVE_CONTROLLER_NAME}` ' - 'and try again.') from e - - try: - subprocess_utils.handle_returncode(returncode, code, - 'Failed to terminate service', - stdout) - except exceptions.CommandError as e: - raise RuntimeError(e.error_msg) from e - - sky_logging.print(stdout) - - -@usage_lib.entrypoint -def serve_tail_logs( - service_name: str, - *, - target: Union[str, serve.ServiceComponent], - replica_id: Optional[int] = None, - follow: bool = True, -) -> None: - """Tail logs for a service. - - Usage: - core.serve_tail_logs( - service_name, - target=, - follow=False, # Optionally, default to True - # replica_id=3, # Must be specified when target is REPLICA. - ) - - `target` is a enum of sky.serve.ServiceComponent, which can be one of: - - CONTROLLER - - LOAD_BALANCER - - REPLICA - Pass target as a lower-case string is also supported, e.g. - target='controller'. - To use REPLICA, you must specify `replica_id`. 
- - To tail controller logs: - # follow default to True - core.serve_tail_logs( - service_name, target=sky.serve.ServiceComponent.CONTROLLER) - - To print replica 3 logs: - # Pass target as a lower-case string is also supported. - core.serve_tail_logs( - service_name, target='replica', - follow=False, replica_id=3) - - Raises: - sky.exceptions.ClusterNotUpError: the sky serve controller is not up. - ValueError: arguments not valid, or failed to tail the logs. - """ - if isinstance(target, str): - target = serve.ServiceComponent(target) - if not isinstance(target, serve.ServiceComponent): - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'`target` must be a string or ' - f'sky.serve.ServiceComponent, got {type(target)}.') - if target == serve.ServiceComponent.REPLICA: - if replica_id is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - '`replica_id` must be specified when using target=REPLICA.') - else: - if replica_id is not None: - with ux_utils.print_exception_no_traceback(): - raise ValueError('`replica_id` must be None when using ' - 'target=CONTROLLER/LOAD_BALANCER.') - controller_status, handle = backend_utils.is_controller_up( - controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message='No service is found.') - if handle is None or handle.head_ip is None: - msg = 'No service is found.' 
- if controller_status == status_lib.ClusterStatus.INIT: - msg = '' - raise exceptions.ClusterNotUpError(msg, - cluster_status=controller_status) - backend = backend_utils.get_backend_from_handle(handle) - assert isinstance(backend, backends.CloudVmRayBackend), backend - backend.tail_serve_logs(handle, - service_name, - target, - replica_id, - follow=follow) diff --git a/sky/execution.py b/sky/execution.py index a5965bf9f75..cc2ab11711d 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -5,7 +5,6 @@ import copy import enum import os -import re import tempfile from typing import Any, List, Optional, Tuple, Union import uuid @@ -15,16 +14,13 @@ import sky from sky import backends from sky import clouds -from sky import exceptions from sky import global_user_state from sky import optimizer -from sky import serve from sky import sky_logging from sky import spot from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp -from sky.skylet import constants from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils @@ -147,7 +143,7 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str], return task -def _execute( +def execute( entrypoint: Union['sky.Task', 'sky.Dag'], dryrun: bool = False, down: bool = False, @@ -502,7 +498,7 @@ def launch( controller_utils.check_cluster_name_not_controller( cluster_name, operation_str='sky.launch') - return _execute( + return execute( entrypoint=entrypoint, dryrun=dryrun, down=down, @@ -599,7 +595,7 @@ def exec( # pylint: disable=redefined-builtin operation='executing tasks', check_cloud_vm_ray_backend=False, dryrun=dryrun) - return _execute( + return execute( entrypoint=entrypoint, dryrun=dryrun, down=down, @@ -717,7 +713,7 @@ def spot_launch( f'Launching managed spot job {dag.name!r} from spot controller...' 
f'{colorama.Style.RESET_ALL}') print('Launching spot controller...') - _execute( + execute( entrypoint=controller_task, stream_logs=stream_logs, cluster_name=controller_name, @@ -726,222 +722,3 @@ def spot_launch( CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) - - -@usage_lib.entrypoint -def serve_up( - task: 'sky.Task', - service_name: Optional[str] = None, -) -> None: - """Spin up a service. - - Please refer to the sky.cli.serve_up for the document. - - Args: - task: sky.Task to serve up. - service_name: Name of the service. - """ - if service_name is None: - service_name = serve.generate_service_name() - - # The service name will be used as: - # 1. controller cluster name: 'sky-serve-controller-' - # 2. replica cluster name: '-' - # In both cases, service name shares the same regex with cluster name. - if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Service name {service_name!r} is invalid: ' - f'ensure it is fully matched by regex (e.g., ' - 'only contains lower letters, numbers and dash): ' - f'{constants.CLUSTER_NAME_VALID_REGEX}') - - if task.service is None: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError('Service section not found.') - - assert len(task.resources) == 1, task - requested_resources = list(task.resources)[0] - if requested_resources.ports is None or len(requested_resources.ports) != 1: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Must only specify one port in resources. 
Each replica ' - 'will use the port specified as application ingress port.') - - controller_utils.maybe_translate_local_file_mounts_and_sync_up(task, - path='serve') - - with tempfile.NamedTemporaryFile( - prefix=f'service-task-{service_name}-', - mode='w', - ) as service_file, tempfile.NamedTemporaryFile( - prefix=f'controller-task-{service_name}-', - mode='w', - ) as controller_file: - controller_name = serve.SKY_SERVE_CONTROLLER_NAME - task_config = task.to_yaml_config() - common_utils.dump_yaml(service_file.name, task_config) - remote_tmp_task_yaml_path = ( - serve.generate_remote_tmp_task_yaml_file_name(service_name)) - remote_config_yaml_path = ( - serve.generate_remote_config_yaml_file_name(service_name)) - controller_log_file = ( - serve.generate_remote_controller_log_file_name(service_name)) - extra_vars, controller_resources_config = ( - controller_utils.skypilot_config_setup( - controller_type='serve', - controller_resources_config=serve.CONTROLLER_RESOURCES, - remote_user_config_path=remote_config_yaml_path)) - try: - controller_resources = sky.Resources.from_yaml_config( - controller_resources_config) - except ValueError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - controller_utils.CONTROLLER_RESOURCES_NOT_VALID_MESSAGE. 
- format(controller_type='serve', - err=common_utils.format_exception( - e, use_bracket=True))) from e - vars_to_fill = { - 'remote_task_yaml_path': remote_tmp_task_yaml_path, - 'local_task_yaml_path': service_file.name, - 'google_sdk_installation_commands': - gcp.GOOGLE_SDK_INSTALLATION_COMMAND, - 'service_name': service_name, - 'controller_log_file': controller_log_file, - **extra_vars, - } - backend_utils.fill_template(serve.CONTROLLER_TEMPLATE, - vars_to_fill, - output_path=controller_file.name) - controller_task = task_lib.Task.from_yaml(controller_file.name) - controller_exist = ( - global_user_state.get_cluster_from_name(controller_name) - is not None) - controller_cloud = ( - requested_resources.cloud if not controller_exist and - controller_resources.cloud is None else controller_resources.cloud) - # TODO(tian): Probably run another sky.launch after we get the load - # balancer port from the controller? So we don't need to open so many - # ports here. Or, we should have a nginx traffic control to refuse - # any connection to the unregistered ports. - controller_resources = controller_resources.copy( - cloud=controller_cloud, ports=[serve.LOAD_BALANCER_PORT_RANGE]) - controller_task.set_resources(controller_resources) - - # # Set service_name so the backend will know to modify default ray - # task CPU usage to custom value instead of default 0.5 vCPU. We need - # to set it to a smaller value to support a larger number of services. - controller_task.service_name = service_name - - print(f'{colorama.Fore.YELLOW}Launching controller for ' - f'{service_name!r}...{colorama.Style.RESET_ALL}') - # We directly submit the request to the controller and let the - # controller to check name conflict. Suppose we have multiple - # sky.serve_up() with same service name, the first one will - # successfully write its job id to controller service database; - # and for all following sky.serve_up, the controller will throw - # an exception (name conflict detected) and exit. 
Therefore the - # controller job id in database could be use as an indicator of - # whether the service is already running. If the id is the same - # with the current job id, we know the service is up and running - # for the first time; otherwise it is a name conflict. - controller_job_id, controller_handle = _execute( - entrypoint=controller_task, - stream_logs=False, - cluster_name=controller_name, - detach_run=True, - idle_minutes_to_autostop=controller_utils. - CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, - retry_until_up=True, - ) - - style = colorama.Style - fore = colorama.Fore - - assert controller_job_id is not None and controller_handle is not None - # TODO(tian): Cache endpoint locally to speedup. Endpoint won't - # change after the first time, so there is no consistency issue. - with rich_utils.safe_status( - '[cyan]Waiting for the service to initialize[/]'): - # This function will check the controller job id in the database - # and return the endpoint if the job id matches. Otherwise it will - # return None. - code = serve.ServeCodeGen.wait_service_initialization( - service_name, controller_job_id) - backend = backend_utils.get_backend_from_handle(controller_handle) - assert isinstance(backend, backends.CloudVmRayBackend) - assert isinstance(controller_handle, - backends.CloudVmRayResourceHandle) - returncode, lb_port_payload, _ = backend.run_on_head( - controller_handle, - code, - require_outputs=True, - stream_logs=False) - try: - subprocess_utils.handle_returncode( - returncode, code, 'Failed to wait for service initialization', - lb_port_payload) - except exceptions.CommandError: - statuses = backend.get_job_status(controller_handle, - [controller_job_id], - stream_logs=False) - controller_job_status = list(statuses.values())[0] - if controller_job_status == sky.JobStatus.PENDING: - # Max number of services reached due to vCPU constraint. - # The controller job is pending due to ray job scheduling. - # We manually cancel the job here. 
- backend.cancel_jobs(controller_handle, [controller_job_id]) - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Max number of services reached. ' - 'To spin up more services, please ' - 'tear down some existing services.') from None - else: - # Possible cases: - # (1) name conflict; - # (2) max number of services reached due to memory - # constraint. The job will successfully run on the - # controller, but there will be an error thrown due - # to memory constraint check in the controller. - # See sky/serve/service.py for more details. - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'Failed to spin up the service. Please ' - 'check the logs above for more details.') from None - else: - lb_port = serve.load_service_initialization_result(lb_port_payload) - endpoint = f'{controller_handle.head_ip}:{lb_port}' - - sky_logging.print( - f'{fore.CYAN}Service name: ' - f'{style.BRIGHT}{service_name}{style.RESET_ALL}' - f'\n{fore.CYAN}Endpoint URL: ' - f'{style.BRIGHT}{endpoint}{style.RESET_ALL}' - '\nTo see detailed info:\t\t' - f'{backend_utils.BOLD}sky serve status {service_name} ' - f'[--endpoint]{backend_utils.RESET_BOLD}' - '\nTo teardown the service:\t' - f'{backend_utils.BOLD}sky serve down {service_name}' - f'{backend_utils.RESET_BOLD}' - '\n' - '\nTo see logs of a replica:\t' - f'{backend_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]' - f'{backend_utils.RESET_BOLD}' - '\nTo see logs of load balancer:\t' - f'{backend_utils.BOLD}sky serve logs --load-balancer {service_name}' - f'{backend_utils.RESET_BOLD}' - '\nTo see logs of controller:\t' - f'{backend_utils.BOLD}sky serve logs --controller {service_name}' - f'{backend_utils.RESET_BOLD}' - '\n' - '\nTo monitor replica status:\t' - f'{backend_utils.BOLD}watch -n10 sky serve status {service_name}' - f'{backend_utils.RESET_BOLD}' - '\nTo send a test request:\t\t' - f'{backend_utils.BOLD}curl -L {endpoint}' - f'{backend_utils.RESET_BOLD}' - '\n' - 
f'\n{fore.GREEN}SkyServe is spinning up your service now.' - f'{style.RESET_ALL}' - f'\n{fore.GREEN}The replicas should be ready within a ' - f'short time.{style.RESET_ALL}') diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 920d7341913..55ce2fada74 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,22 +1,18 @@ """Modules for SkyServe services.""" import os -from sky.serve.constants import CONTROLLER_RESOURCES -from sky.serve.constants import CONTROLLER_TEMPLATE +from sky.serve.api import down +from sky.serve.api import status +from sky.serve.api import tail_logs +from sky.serve.api import up from sky.serve.constants import ENDPOINT_PROBE_INTERVAL_SECONDS from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL_SECONDS -from sky.serve.constants import LOAD_BALANCER_PORT_RANGE from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.constants import SKYSERVE_METADATA_DIR from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus -from sky.serve.serve_utils import generate_remote_config_yaml_file_name -from sky.serve.serve_utils import generate_remote_controller_log_file_name -from sky.serve.serve_utils import generate_remote_tmp_task_yaml_file_name from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import generate_service_name -from sky.serve.serve_utils import load_service_initialization_result -from sky.serve.serve_utils import load_service_status from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME diff --git a/sky/serve/api.py b/sky/serve/api.py new file mode 100644 index 00000000000..2505878f357 --- /dev/null +++ b/sky/serve/api.py @@ -0,0 +1,510 @@ +"""SkyServe core APIs.""" +import re +import tempfile +from typing import Any, Dict, List, Optional, Union + +import colorama + +import sky +from sky import backends +from sky import 
exceptions +from sky import global_user_state +from sky import sky_logging +from sky import status_lib +from sky.backends import backend_utils +from sky.clouds import gcp +from sky.serve import constants as serve_constants +from sky.serve import serve_utils +from sky.skylet import constants +from sky.usage import usage_lib +from sky.utils import common_utils +from sky.utils import rich_utils +from sky.utils import subprocess_utils +from sky.utils import ux_utils + + +@usage_lib.entrypoint +def up( + task: 'sky.Task', + service_name: Optional[str] = None, +) -> None: + """Spin up a service. + + Please refer to the sky.cli.serve_up for the document. + + Args: + task: sky.Task to serve up. + service_name: Name of the service. + """ + # Import here to avoid circular import. + # pylint: disable=import-outside-toplevel + from sky import execution + from sky import task as task_lib + from sky.utils import controller_utils + if service_name is None: + service_name = serve_utils.generate_service_name() + + # The service name will be used as: + # 1. controller cluster name: 'sky-serve-controller-' + # 2. replica cluster name: '-' + # In both cases, service name shares the same regex with cluster name. + if re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, service_name) is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Service name {service_name!r} is invalid: ' + f'ensure it is fully matched by regex (e.g., ' + 'only contains lower letters, numbers and dash): ' + f'{constants.CLUSTER_NAME_VALID_REGEX}') + + if task.service is None: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError('Service section not found.') + + assert len(task.resources) == 1, task + requested_resources = list(task.resources)[0] + if requested_resources.ports is None or len(requested_resources.ports) != 1: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Must only specify one port in resources. 
Each replica ' + 'will use the port specified as application ingress port.') + + controller_utils.maybe_translate_local_file_mounts_and_sync_up(task, + path='serve') + + with tempfile.NamedTemporaryFile( + prefix=f'service-task-{service_name}-', + mode='w', + ) as service_file, tempfile.NamedTemporaryFile( + prefix=f'controller-task-{service_name}-', + mode='w', + ) as controller_file: + controller_name = serve_utils.SKY_SERVE_CONTROLLER_NAME + task_config = task.to_yaml_config() + common_utils.dump_yaml(service_file.name, task_config) + remote_tmp_task_yaml_path = ( + serve_utils.generate_remote_tmp_task_yaml_file_name(service_name)) + remote_config_yaml_path = ( + serve_utils.generate_remote_config_yaml_file_name(service_name)) + controller_log_file = ( + serve_utils.generate_remote_controller_log_file_name(service_name)) + extra_vars, controller_resources_config = ( + controller_utils.skypilot_config_setup( + controller_type='serve', + controller_resources_config=serve_constants. + CONTROLLER_RESOURCES, + remote_user_config_path=remote_config_yaml_path)) + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + controller_utils.CONTROLLER_RESOURCES_NOT_VALID_MESSAGE. 
+ format(controller_type='serve', + err=common_utils.format_exception( + e, use_bracket=True))) from e + vars_to_fill = { + 'remote_task_yaml_path': remote_tmp_task_yaml_path, + 'local_task_yaml_path': service_file.name, + 'google_sdk_installation_commands': + gcp.GOOGLE_SDK_INSTALLATION_COMMAND, + 'service_name': service_name, + 'controller_log_file': controller_log_file, + **extra_vars, + } + backend_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE, + vars_to_fill, + output_path=controller_file.name) + controller_task = task_lib.Task.from_yaml(controller_file.name) + controller_exist = ( + global_user_state.get_cluster_from_name(controller_name) + is not None) + controller_cloud = ( + requested_resources.cloud if not controller_exist and + controller_resources.cloud is None else controller_resources.cloud) + # TODO(tian): Probably run another sky.launch after we get the load + # balancer port from the controller? So we don't need to open so many + # ports here. Or, we should have a nginx traffic control to refuse + # any connection to the unregistered ports. + controller_resources = controller_resources.copy( + cloud=controller_cloud, + ports=[serve_constants.LOAD_BALANCER_PORT_RANGE]) + controller_task.set_resources(controller_resources) + + # # Set service_name so the backend will know to modify default ray + # task CPU usage to custom value instead of default 0.5 vCPU. We need + # to set it to a smaller value to support a larger number of services. + controller_task.service_name = service_name + + print(f'{colorama.Fore.YELLOW}Launching controller for ' + f'{service_name!r}...{colorama.Style.RESET_ALL}') + # We directly submit the request to the controller and let the + # controller to check name conflict. 
Suppose we have multiple + # sky.serve.up() with same service name, the first one will + # successfully write its job id to controller service database; + # and for all following sky.serve.up, the controller will throw + # an exception (name conflict detected) and exit. Therefore the + # controller job id in database could be use as an indicator of + # whether the service is already running. If the id is the same + # with the current job id, we know the service is up and running + # for the first time; otherwise it is a name conflict. + controller_job_id, controller_handle = execution.execute( + entrypoint=controller_task, + stream_logs=False, + cluster_name=controller_name, + detach_run=True, + idle_minutes_to_autostop=controller_utils. + CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, + retry_until_up=True, + ) + + style = colorama.Style + fore = colorama.Fore + + assert controller_job_id is not None and controller_handle is not None + # TODO(tian): Cache endpoint locally to speedup. Endpoint won't + # change after the first time, so there is no consistency issue. + with rich_utils.safe_status( + '[cyan]Waiting for the service to initialize[/]'): + # This function will check the controller job id in the database + # and return the endpoint if the job id matches. Otherwise it will + # return None. 
+ code = serve_utils.ServeCodeGen.wait_service_initialization( + service_name, controller_job_id) + backend = backend_utils.get_backend_from_handle(controller_handle) + assert isinstance(backend, backends.CloudVmRayBackend) + assert isinstance(controller_handle, + backends.CloudVmRayResourceHandle) + returncode, lb_port_payload, _ = backend.run_on_head( + controller_handle, + code, + require_outputs=True, + stream_logs=False) + try: + subprocess_utils.handle_returncode( + returncode, code, 'Failed to wait for service initialization', + lb_port_payload) + except exceptions.CommandError: + statuses = backend.get_job_status(controller_handle, + [controller_job_id], + stream_logs=False) + controller_job_status = list(statuses.values())[0] + if controller_job_status == sky.JobStatus.PENDING: + # Max number of services reached due to vCPU constraint. + # The controller job is pending due to ray job scheduling. + # We manually cancel the job here. + backend.cancel_jobs(controller_handle, [controller_job_id]) + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Max number of services reached. ' + 'To spin up more services, please ' + 'tear down some existing services.') from None + else: + # Possible cases: + # (1) name conflict; + # (2) max number of services reached due to memory + # constraint. The job will successfully run on the + # controller, but there will be an error thrown due + # to memory constraint check in the controller. + # See sky/serve/service.py for more details. + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Failed to spin up the service. 
Please ' + 'check the logs above for more details.') from None + else: + lb_port = serve_utils.load_service_initialization_result( + lb_port_payload) + endpoint = f'{controller_handle.head_ip}:{lb_port}' + + sky_logging.print( + f'{fore.CYAN}Service name: ' + f'{style.BRIGHT}{service_name}{style.RESET_ALL}' + f'\n{fore.CYAN}Endpoint URL: ' + f'{style.BRIGHT}{endpoint}{style.RESET_ALL}' + '\nTo see detailed info:\t\t' + f'{backend_utils.BOLD}sky serve status {service_name} ' + f'[--endpoint]{backend_utils.RESET_BOLD}' + '\nTo teardown the service:\t' + f'{backend_utils.BOLD}sky serve down {service_name}' + f'{backend_utils.RESET_BOLD}' + '\n' + '\nTo see logs of a replica:\t' + f'{backend_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of load balancer:\t' + f'{backend_utils.BOLD}sky serve logs --load-balancer {service_name}' + f'{backend_utils.RESET_BOLD}' + '\nTo see logs of controller:\t' + f'{backend_utils.BOLD}sky serve logs --controller {service_name}' + f'{backend_utils.RESET_BOLD}' + '\n' + '\nTo monitor replica status:\t' + f'{backend_utils.BOLD}watch -n10 sky serve status {service_name}' + f'{backend_utils.RESET_BOLD}' + '\nTo send a test request:\t\t' + f'{backend_utils.BOLD}curl -L {endpoint}' + f'{backend_utils.RESET_BOLD}' + '\n' + f'\n{fore.GREEN}SkyServe is spinning up your service now.' + f'{style.RESET_ALL}' + f'\n{fore.GREEN}The replicas should be ready within a ' + f'short time.{style.RESET_ALL}') + + +@usage_lib.entrypoint +# pylint: disable=redefined-builtin +def down( + service_names: Optional[Union[str, List[str]]] = None, + all: bool = False, + purge: bool = False, +) -> None: + """Teardown a service. + + Please refer to the sky.cli.serve_down for the docs. + + Args: + service_names: Name of the service(s). + all: Whether to terminate all services. + purge: Whether to terminate services in a failed status. These services + may potentially lead to resource leaks. 
+ + Raises: + sky.exceptions.ClusterNotUpError: if the sky serve controller is not up. + ValueError: if the arguments are invalid. + RuntimeError: if failed to terminate the service. + """ + # Import here to avoid circular import. + # pylint: disable=import-outside-toplevel + from sky.utils import controller_utils + if service_names is None: + service_names = [] + if isinstance(service_names, str): + service_names = [service_names] + cluster_status, handle = backend_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='All services should have terminated.') + if handle is None or handle.head_ip is None: + # The error message is already printed in + # backend_utils.is_controller_up + # TODO(zhwu): Move the error message into the exception. + with ux_utils.print_exception_no_traceback(): + raise exceptions.ClusterNotUpError(message='', + cluster_status=cluster_status) + + service_names_str = ','.join(service_names) + if sum([len(service_names) > 0, all]) != 1: + argument_str = f'service_names={service_names_str}' if len( + service_names) > 0 else '' + argument_str += ' all' if all else '' + raise ValueError('Can only specify one of service_names or all. ' + f'Provided {argument_str!r}.') + + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + service_names = None if all else service_names + code = serve_utils.ServeCodeGen.terminate_services(service_names, purge) + + try: + returncode, stdout, _ = backend.run_on_head(handle, + code, + require_outputs=True, + stream_logs=False) + except exceptions.FetchIPError as e: + raise RuntimeError( + 'Failed to fetch controller IP. 
Please refresh controller status ' + f'by `sky status -r {serve_utils.SKY_SERVE_CONTROLLER_NAME}` ' + 'and try again.') from e + + try: + subprocess_utils.handle_returncode(returncode, code, + 'Failed to terminate service', + stdout) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + + sky_logging.print(stdout) + + +@usage_lib.entrypoint +def status( + service_names: Optional[Union[str, + List[str]]] = None) -> List[Dict[str, Any]]: + """Get service statuses. + + If service_names is given, return those services. Otherwise, return all + services. + + Each returned value has the following fields: + + .. code-block:: python + + { + 'name': (str) service name, + 'controller_job_id': (int) the job id of the controller, + 'uptime': (int) uptime in seconds, + 'status': (sky.ServiceStatus) service status, + 'controller_port': (Optional[int]) controller port, + 'load_balancer_port': (Optional[int]) load balancer port, + 'policy': (Optional[str]) load balancer policy description, + 'auto_restart': (bool) whether the service replica will be + auto-restarted, + 'requested_resources': (sky.Resources) requested resources + for replica, + 'replica_info': (List[Dict[str, Any]]) replica information, + } + + Each entry in replica_info has the following fields: + + .. code-block:: python + + { + 'replica_id': (int) replica id, + 'name': (str) replica name, + 'status': (sky.serve.ReplicaStatus) replica status, + 'launched_at': (int) timestamp of launched, + 'handle': (ResourceHandle) handle of the replica cluster, + } + + For possible service statuses and replica statuses, please refer to + sky.cli.serve_status. + + Args: + service_names: a single or a list of service names to query. If None, + query all services. + + Returns: + A list of dicts, with each dict containing the information of a service. + If a service is not found, it will be omitted from the returned list. + + Raises: + RuntimeError: if failed to get the service status. 
+ exceptions.ClusterNotUpError: if the sky serve controller is not up. + """ + # Import here to avoid circular import. + # pylint: disable=import-outside-toplevel + from sky.utils import controller_utils + if service_names is not None: + if isinstance(service_names, str): + service_names = [service_names] + + try: + backend_utils.check_network_connection() + except exceptions.NetworkError as e: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'Failed to refresh service status due to network error.') from e + + # TODO(tian): This is so slow... It will take ~10s to refresh the status + # of controller. Can we optimize this? + controller_status, handle = backend_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='No service is found.') + + if handle is None or handle.head_ip is None: + # When the controller is STOPPED, the head_ip will be None, as + # it will be set in global_user_state.remove_cluster(). + # We do not directly check for UP because the controller may be + # in INIT state during another `sky serve up`, but still have + # head_ip available. In this case, we can still try to ssh + # into the controller and fetch the job table. 
+ raise exceptions.ClusterNotUpError('Sky serve controller is not up.', + cluster_status=controller_status) + + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend) + + code = serve_utils.ServeCodeGen.get_service_status(service_names) + returncode, serve_status_payload, stderr = backend.run_on_head( + handle, + code, + require_outputs=True, + stream_logs=False, + separate_stderr=True) + + try: + subprocess_utils.handle_returncode(returncode, + code, + 'Failed to fetch services', + stderr, + stream_logs=True) + except exceptions.CommandError as e: + raise RuntimeError(e.error_msg) from e + + return serve_utils.load_service_status(serve_status_payload) + + +@usage_lib.entrypoint +def tail_logs( + service_name: str, + *, + target: Union[str, serve_utils.ServiceComponent], + replica_id: Optional[int] = None, + follow: bool = True, +) -> None: + """Tail logs for a service. + + Usage: + sky.serve.tail_logs( + service_name, + target=, + follow=False, # Optionally, default to True + # replica_id=3, # Must be specified when target is REPLICA. + ) + + `target` is a enum of sky.serve.ServiceComponent, which can be one of: + - CONTROLLER + - LOAD_BALANCER + - REPLICA + Pass target as a lower-case string is also supported, e.g. + target='controller'. + To use REPLICA, you must specify `replica_id`. + + To tail controller logs: + # follow default to True + sky.serve.tail_logs( + service_name, target=sky.serve.ServiceComponent.CONTROLLER) + + To print replica 3 logs: + # Pass target as a lower-case string is also supported. + sky.serve.tail_logs( + service_name, target='replica', + follow=False, replica_id=3) + + Raises: + sky.exceptions.ClusterNotUpError: the sky serve controller is not up. + ValueError: arguments not valid, or failed to tail the logs. + """ + # Import here to avoid circular import. 
+ # pylint: disable=import-outside-toplevel + from sky.utils import controller_utils + if isinstance(target, str): + target = serve_utils.ServiceComponent(target) + if not isinstance(target, serve_utils.ServiceComponent): + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'`target` must be a string or ' + f'sky.serve.ServiceComponent, got {type(target)}.') + if target == serve_utils.ServiceComponent.REPLICA: + if replica_id is None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + '`replica_id` must be specified when using target=REPLICA.') + else: + if replica_id is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError('`replica_id` must be None when using ' + 'target=CONTROLLER/LOAD_BALANCER.') + controller_status, handle = backend_utils.is_controller_up( + controller_type=controller_utils.Controllers.SKY_SERVE_CONTROLLER, + stopped_message='No service is found.') + if handle is None or handle.head_ip is None: + msg = 'No service is found.' + if controller_status == status_lib.ClusterStatus.INIT: + msg = '' + raise exceptions.ClusterNotUpError(msg, + cluster_status=controller_status) + backend = backend_utils.get_backend_from_handle(handle) + assert isinstance(backend, backends.CloudVmRayBackend), backend + backend.tail_serve_logs(handle, + service_name, + target, + replica_id, + follow=follow) diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index edaf9979354..b0a09abe1b3 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -397,13 +397,13 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str: def wait_service_initialization(service_name: str, job_id: int) -> str: - """Util function to call at the end of `sky.serve_up()`. + """Util function to call at the end of `sky.serve.up()`. This function will: (1) Check the name duplication by job id of the controller. 
If the job id is not the same as the database record, this means another service is already taken that name. See - sky/execution.py::serve_up for more details. + sky/serve/api.py::up for more details. (2) Wait for the load balancer port to be assigned and return. Returns: diff --git a/sky/serve/service.py b/sky/serve/service.py index d81e1364037..1b2aaf253c0 100644 --- a/sky/serve/service.py +++ b/sky/serve/service.py @@ -147,7 +147,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int): auto_restart=service_spec.auto_restart, requested_resources=requested_resources, status=serve_state.ServiceStatus.CONTROLLER_INIT) - # Directly throw an error here. See sky/execution.py::serve_up + # Directly throw an error here. See sky/serve/api.py::up # for more details. if not success: _cleanup_storage(tmp_task_yaml) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index a3278365b95..ccb07221e7b 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -32,7 +32,7 @@ # controller and sky serve controller. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 -# Message thrown when APIs sky.{spot_launch,serve_up}() received an invalid +# Message thrown when APIs sky.spot_launch(),sky.serve.up() received an invalid # controller resources spec. 
CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( '{controller_type} controller resources is not valid, please check ' From e87042a4c859770a31c625371459b230e5d3c8ad Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 10:54:36 -0800 Subject: [PATCH 216/223] resolve controller_utils circular import --- sky/serve/api.py | 11 +---------- sky/utils/cli_utils/status_utils.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sky/serve/api.py b/sky/serve/api.py index 2505878f357..f2da876450f 100644 --- a/sky/serve/api.py +++ b/sky/serve/api.py @@ -18,6 +18,7 @@ from sky.skylet import constants from sky.usage import usage_lib from sky.utils import common_utils +from sky.utils import controller_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -40,7 +41,6 @@ def up( # pylint: disable=import-outside-toplevel from sky import execution from sky import task as task_lib - from sky.utils import controller_utils if service_name is None: service_name = serve_utils.generate_service_name() @@ -272,9 +272,6 @@ def down( ValueError: if the arguments are invalid. RuntimeError: if failed to terminate the service. """ - # Import here to avoid circular import. - # pylint: disable=import-outside-toplevel - from sky.utils import controller_utils if service_names is None: service_names = [] if isinstance(service_names, str): @@ -379,9 +376,6 @@ def status( RuntimeError: if failed to get the service status. exceptions.ClusterNotUpError: if the sky serve controller is not up. """ - # Import here to avoid circular import. - # pylint: disable=import-outside-toplevel - from sky.utils import controller_utils if service_names is not None: if isinstance(service_names, str): service_names = [service_names] @@ -473,9 +467,6 @@ def tail_logs( sky.exceptions.ClusterNotUpError: the sky serve controller is not up. ValueError: arguments not valid, or failed to tail the logs. 
""" - # Import here to avoid circular import. - # pylint: disable=import-outside-toplevel - from sky.utils import controller_utils if isinstance(target, str): target = serve_utils.ServiceComponent(target) if not isinstance(target, serve_utils.ServiceComponent): diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index dc8bec5c237..26d26bf0191 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,5 +1,6 @@ """Utilities for sky status.""" import re +import typing from typing import Any, Callable, Dict, List, Optional import click @@ -7,13 +8,15 @@ from sky import backends from sky import global_user_state -from sky import serve from sky import status_lib from sky.backends import backend_utils from sky.utils import common_utils from sky.utils import controller_utils from sky.utils import log_utils +if typing.TYPE_CHECKING: + from sky import serve + COMMAND_TRUNC_LENGTH = 25 REPLICA_TRUNC_NUM = 10 NUM_COST_REPORT_LINES = 5 @@ -114,6 +117,7 @@ def show_status_table(cluster_records: List[_ClusterRecord], return num_pending_autostop +# TODO(tian): Refactor to sky/serve. def format_service_table(service_records: List[_ServiceRecord], show_all: bool) -> str: if not service_records: @@ -410,6 +414,9 @@ def _get_uptime(service_record: _ServiceRecord) -> str: def _get_replicas(service_record: _ServiceRecord) -> str: + # Import here to avoid circular import + from sky import serve # pylint: disable=import-outside-toplevel + ready_replica_num, total_replica_num = 0, 0 for info in service_record['replica_info']: if _get_status(info) == serve.ReplicaStatus.READY: @@ -422,6 +429,9 @@ def _get_replicas(service_record: _ServiceRecord) -> str: def get_endpoint(service_record: _ServiceRecord) -> str: + # Import here to avoid circular import + from sky import serve # pylint: disable=import-outside-toplevel + # Don't use backend_utils.is_controller_up since it is too slow. 
handle = global_user_state.get_handle_from_cluster_name( serve.SKY_SERVE_CONTROLLER_NAME) @@ -434,7 +444,8 @@ def get_endpoint(service_record: _ServiceRecord) -> str: return f'{handle.head_ip}:{load_balancer_port}' -def _get_service_status(service_record: _ServiceRecord) -> serve.ServiceStatus: +def _get_service_status( + service_record: _ServiceRecord) -> 'serve.ServiceStatus': return service_record['status'] From a276a8f6006fad7e3743b8d4f3ab541c53d46a7c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 14:18:31 -0800 Subject: [PATCH 217/223] fix spot config --- sky/execution.py | 8 +++++--- sky/templates/spot-controller.yaml.j2 | 7 +++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index cc2ab11711d..e157215eaf2 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -663,11 +663,14 @@ def spot_launch( mode='w') as f: dag_utils.dump_chain_dag_to_yaml(dag, f.name) controller_name = spot.SPOT_CONTROLLER_NAME + prefix = spot.SPOT_TASK_YAML_PREFIX + remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml' + remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml' extra_vars, controller_resources_config = ( controller_utils.skypilot_config_setup( controller_type='spot', controller_resources_config=spot.constants.CONTROLLER_RESOURCES, - remote_user_config_path=f'{dag.name}-{dag_uuid}.config_yaml')) + remote_user_config_path=remote_user_config_path)) try: controller_resources = sky.Resources.from_yaml_config( controller_resources_config) @@ -679,12 +682,11 @@ def spot_launch( err=common_utils.format_exception( e, use_bracket=True))) from e vars_to_fill = { - 'remote_user_yaml_prefix': spot.SPOT_TASK_YAML_PREFIX, + 'remote_user_yaml_path': remote_user_yaml_path, 'user_yaml_path': f.name, 'spot_controller': controller_name, # Note: actual spot cluster name will be - 'dag_name': dag.name, - 'uuid': dag_uuid, 'google_sdk_installation_commands': gcp.GOOGLE_SDK_INSTALLATION_COMMAND, 
'retry_until_up': retry_until_up, diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 5bc892e8365..746322fd956 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -3,9 +3,9 @@ name: {{dag_name}} file_mounts: - {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml: {{user_yaml_path}} + {{remote_user_yaml_path}}: {{user_yaml_path}} {% if user_config_path is not none %} - {{remote_user_yaml_prefix}}/{{remote_user_config_path}}: {{user_config_path}} + {{remote_user_config_path}}: {{user_config_path}} {% endif %} setup: | @@ -33,8 +33,7 @@ setup: | ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.spot.dashboard.dashboard") || (nohup python3 -m sky.spot.dashboard.dashboard >> ~/.sky/spot-dashboard.log 2>&1 &)); run: | - python -u -m sky.spot.controller \ - {{remote_user_yaml_prefix}}/{{dag_name}}-{{uuid}}.yaml \ + python -u -m sky.spot.controller {{remote_user_yaml_path}} \ --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %} envs: From 68891191b29ad9bc5feb9f2c71d8f72f49dd5e51 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 15:52:10 -0800 Subject: [PATCH 218/223] minor ux --- sky/cli.py | 6 +++--- sky/serve/load_balancer.py | 2 +- sky/serve/serve_utils.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 2aaf75c4212..cd737db6dc8 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4511,13 +4511,13 @@ def serve_logs( .. 
code-block:: bash # Tail the controller logs of a service - sky serve logs --controller [SERVICE_ID] + sky serve logs --controller [SERVICE_NAME] \b # Print the load balancer logs so far and exit - sky serve logs --load-balancer --no-follow [SERVICE_ID] + sky serve logs --load-balancer --no-follow [SERVICE_NAME] \b # Tail the logs of replica 1 - sky serve logs [SERVICE_ID] 1 + sky serve logs [SERVICE_NAME] 1 """ have_replica_id = replica_id is not None num_flags = (controller + load_balancer + have_replica_id) diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py index 28722a72db9..01c0ff6d028 100644 --- a/sky/serve/load_balancer.py +++ b/sky/serve/load_balancer.py @@ -84,7 +84,7 @@ async def _redirect_handler(self, request: fastapi.Request): if ready_replica_url is None: raise fastapi.HTTPException(status_code=503, detail='No available replicas. ' - 'Use "sky serve status [SERVICE_ID]" ' + 'Use "sky serve status [SERVICE_NAME]" ' 'to check the replica status.') path = f'http://{ready_replica_url}{request.url.path}' diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index b0a09abe1b3..710346dbb18 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -38,9 +38,10 @@ _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' _SKYPILOT_LOG_PATTERN = r'.*tail -n100 -f (.*\.log).*' +# TODO(tian): Find all existing replica id and print here. _FAILED_TO_FIND_REPLICA_MSG = ( f'{colorama.Fore.RED}Failed to find replica ' - '{replica_id}. Please use `sky serve status [SERVICE_ID]`' + '{replica_id}. 
Please use `sky serve status [SERVICE_NAME]`' f' to check all valid replica id.{colorama.Style.RESET_ALL}') From 85a1bb076c6d7b0bfda6369c2db454d33a48b262 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 16:50:52 -0800 Subject: [PATCH 219/223] add todo for default argument for sky serve logs --- sky/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/cli.py b/sky/cli.py index cd737db6dc8..d85968408eb 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -4497,6 +4497,8 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): @click.argument('service_name', required=True, type=str) @click.argument('replica_id', required=False, type=int) @usage_lib.entrypoint +# TODO(tian): Add default argument for this CLI if none of the flags are +# specified. def serve_logs( service_name: str, follow: bool, From bcc05baecb05ea5bd8f1b783027e474203c13840 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 17:15:18 -0800 Subject: [PATCH 220/223] resolve circular import. --- sky/core.py | 2 +- sky/execution.py | 3 ++- sky/serve/api.py | 8 +++----- sky/skylet/constants.py | 5 +++++ sky/task.py | 10 +++++----- sky/utils/cli_utils/status_utils.py | 5 ++--- sky/utils/controller_utils.py | 20 ++++++++------------ sky/utils/tpu_utils.py | 13 ++++++++----- 8 files changed, 34 insertions(+), 32 deletions(-) diff --git a/sky/core.py b/sky/core.py index 2488320c058..862856abc79 100644 --- a/sky/core.py +++ b/sky/core.py @@ -195,7 +195,7 @@ def _start( 'fix: omit the `idle_minutes_to_autostop` argument to use the ' f'default autostop settings (got: {idle_minutes_to_autostop}).') idle_minutes_to_autostop = ( - controller_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) + constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) # NOTE: if spot_queue() calls _start() and hits here, that entrypoint # would have a cluster name (the controller) filled in. 
diff --git a/sky/execution.py b/sky/execution.py index e157215eaf2..2612b634d8d 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -21,6 +21,7 @@ from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp +from sky.skylet import constants from sky.usage import usage_lib from sky.utils import common_utils from sky.utils import controller_utils @@ -720,7 +721,7 @@ def spot_launch( stream_logs=stream_logs, cluster_name=controller_name, detach_run=detach_run, - idle_minutes_to_autostop=controller_utils. + idle_minutes_to_autostop=constants. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) diff --git a/sky/serve/api.py b/sky/serve/api.py index f2da876450f..0a32a549e3a 100644 --- a/sky/serve/api.py +++ b/sky/serve/api.py @@ -8,9 +8,11 @@ import sky from sky import backends from sky import exceptions +from sky import execution from sky import global_user_state from sky import sky_logging from sky import status_lib +from sky import task as task_lib from sky.backends import backend_utils from sky.clouds import gcp from sky.serve import constants as serve_constants @@ -37,10 +39,6 @@ def up( task: sky.Task to serve up. service_name: Name of the service. """ - # Import here to avoid circular import. - # pylint: disable=import-outside-toplevel - from sky import execution - from sky import task as task_lib if service_name is None: service_name = serve_utils.generate_service_name() @@ -152,7 +150,7 @@ def up( stream_logs=False, cluster_name=controller_name, detach_run=True, - idle_minutes_to_autostop=controller_utils. + idle_minutes_to_autostop=constants. 
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, ) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 52f3fd9c600..6e49545d3d4 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -104,3 +104,8 @@ FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files' + +# The default idle timeout for SkyPilot controllers. This include spot +# controller and sky serve controller. +# TODO(tian): Refactor to controller_utils. Current blocker: circular import. +CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 diff --git a/sky/task.py b/sky/task.py index 93bb1c086c4..8d45679c707 100644 --- a/sky/task.py +++ b/sky/task.py @@ -14,13 +14,13 @@ from sky import clouds from sky import exceptions from sky import global_user_state -from sky import serve as serve_lib from sky import sky_logging from sky.backends import backend_utils import sky.dag from sky.data import data_utils from sky.data import storage as storage_lib from sky.provision import docker_utils +from sky.serve import service_spec from sky.skylet import constants from sky.utils import common_utils from sky.utils import schemas @@ -257,7 +257,7 @@ def __init__( # Default to CPUNode self.resources: Union[List[sky.Resources], Set[sky.Resources]] = {sky.Resources()} - self._service: Optional[serve_lib.SkyServiceSpec] = None + self._service: Optional[service_spec.SkyServiceSpec] = None # Resources that this task cannot run on. 
self.blocked_resources = blocked_resources @@ -496,7 +496,7 @@ def from_yaml_config( service = config.pop('service', None) if service is not None: - service = serve_lib.SkyServiceSpec.from_yaml_config(service) + service = service_spec.SkyServiceSpec.from_yaml_config(service) task.set_service(service) assert not config, f'Invalid task args: {config.keys()}' @@ -676,11 +676,11 @@ def set_resources_override(self, override_params: Dict[str, Any]) -> 'Task': return self @property - def service(self) -> Optional[serve_lib.SkyServiceSpec]: + def service(self) -> Optional[service_spec.SkyServiceSpec]: return self._service def set_service(self, - service: Optional[serve_lib.SkyServiceSpec]) -> 'Task': + service: Optional[service_spec.SkyServiceSpec]) -> 'Task': """Sets the service spec for this task. Args: diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 26d26bf0191..0af064323be 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -10,8 +10,8 @@ from sky import global_user_state from sky import status_lib from sky.backends import backend_utils +from sky.skylet import constants from sky.utils import common_utils -from sky.utils import controller_utils from sky.utils import log_utils if typing.TYPE_CHECKING: @@ -278,8 +278,7 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], if cluster_records: if controller_name is not None: - autostop_minutes = ( - controller_utils.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) + autostop_minutes = (constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{controller_name}{colorama.Style.RESET_ALL}' f'{colorama.Style.DIM} (will be autostopped if idle for ' diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index ccb07221e7b..4cc74ede590 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -11,13 +11,13 @@ import colorama from sky import 
exceptions -from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config -from sky import spot as spot_lib from sky.data import data_utils from sky.data import storage as storage_lib +from sky.serve import serve_utils from sky.skylet import constants +from sky.spot import spot_utils from sky.utils import common_utils from sky.utils import env_options from sky.utils import ux_utils @@ -28,10 +28,6 @@ logger = sky_logging.init_logger(__name__) -# The default idle timeout for skypilot controllers. This include spot -# controller and sky serve controller. -CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10 - # Message thrown when APIs sky.spot_launch(),sky.serve.up() received an invalid # controller resources spec. CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = ( @@ -60,7 +56,7 @@ class Controllers(enum.Enum): # sky/cli.py::_CONTROLLER_TO_HINT_OR_RAISE SPOT_CONTROLLER = _ControllerSpec( name='managed spot controller', - cluster_name=spot_lib.SPOT_CONTROLLER_NAME, + cluster_name=spot_utils.SPOT_CONTROLLER_NAME, in_progress_hint=( '* {job_info}To see all spot jobs: ' f'{colorama.Style.BRIGHT}sky spot queue{colorama.Style.RESET_ALL}'), @@ -75,18 +71,18 @@ class Controllers(enum.Enum): 'guarantee that all the spot jobs are finished. Please wait ' 'until the spot controller is UP or fix it with ' f'{colorama.Style.BRIGHT}sky start ' - f'{spot_lib.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), + f'{spot_utils.SPOT_CONTROLLER_NAME}{colorama.Style.RESET_ALL}.'), decline_down_for_dirty_controller_hint=( f'{colorama.Fore.RED}In-progress spot jobs found. To avoid ' f'resource leakage, cancel all jobs first: {colorama.Style.BRIGHT}' f'sky spot cancel -a{colorama.Style.RESET_ALL}\n'), check_cluster_name_hint=( - f'Cluster {spot_lib.SPOT_CONTROLLER_NAME} is reserved for ' + f'Cluster {spot_utils.SPOT_CONTROLLER_NAME} is reserved for ' 'managed spot controller. 
'), default_hint_if_non_existent='No managed spot jobs are found.') SKY_SERVE_CONTROLLER = _ControllerSpec( name='sky serve controller', - cluster_name=serve_lib.SKY_SERVE_CONTROLLER_NAME, + cluster_name=serve_utils.SKY_SERVE_CONTROLLER_NAME, in_progress_hint=( f'* To see detailed service status: {colorama.Style.BRIGHT}' f'sky serve status -a{colorama.Style.RESET_ALL}'), @@ -99,7 +95,7 @@ class Controllers(enum.Enum): 'cannot guarantee that all the services are terminated. Please ' 'wait until the sky serve controller is UP or fix it with ' f'{colorama.Style.BRIGHT}sky start ' - f'{serve_lib.SKY_SERVE_CONTROLLER_NAME}' + f'{serve_utils.SKY_SERVE_CONTROLLER_NAME}' f'{colorama.Style.RESET_ALL}.'), decline_down_for_dirty_controller_hint=( f'{colorama.Fore.RED}Tearing down the sky serve controller is not ' @@ -108,7 +104,7 @@ class Controllers(enum.Enum): f'{colorama.Style.BRIGHT}sky serve down -a' f'{colorama.Style.RESET_ALL}.'), check_cluster_name_hint=( - f'Cluster {serve_lib.SKY_SERVE_CONTROLLER_NAME} is reserved for ' + f'Cluster {serve_utils.SKY_SERVE_CONTROLLER_NAME} is reserved for ' 'sky serve controller. 
'), default_hint_if_non_existent='No service is found.') diff --git a/sky/utils/tpu_utils.py b/sky/utils/tpu_utils.py index 369ef94e6a6..0592611d36b 100644 --- a/sky/utils/tpu_utils.py +++ b/sky/utils/tpu_utils.py @@ -1,36 +1,39 @@ """Utility functions for TPUs.""" import json import os +import typing from typing import Optional from packaging import version -from sky import resources as resources_lib from sky.skylet import log_lib from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from sky import resources as resources_lib -def is_tpu(resources: Optional[resources_lib.Resources]) -> bool: + +def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: if resources is None or resources.accelerators is None: return False acc, _ = list(resources.accelerators.items())[0] return acc.startswith('tpu') -def is_tpu_vm(resources: Optional[resources_lib.Resources]) -> bool: +def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: if resources is None or resources.accelerator_args is None: return False return resources.accelerator_args.get('tpu_vm', False) -def is_tpu_vm_pod(resources: Optional[resources_lib.Resources]) -> bool: +def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool: if resources is None or not is_tpu_vm(resources): return False acc, _ = list(resources.accelerators.items())[0] return acc not in ['tpu-v2-8', 'tpu-v3-8', 'tpu-v4-8'] -def get_num_tpu_devices(resources: Optional[resources_lib.Resources]) -> int: +def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int: if resources is None or not is_tpu(resources): raise ValueError('resources must be a valid TPU resource.') acc, _ = list(resources.accelerators.items())[0] From e3f1f3395366605ae92f7318bff68df77cfa28ce Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 18:22:19 -0800 Subject: [PATCH 221/223] fix all circular import --- sky/cli.py | 4 +- sky/serve/__init__.py | 2 + sky/serve/serve_utils.py | 136 +++++++++++++++++++ 
sky/utils/cli_utils/status_utils.py | 197 +--------------------------- sky/utils/resources_utils.py | 34 +++++ 5 files changed, 176 insertions(+), 197 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d85968408eb..a27e10f75b4 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1721,9 +1721,9 @@ def _get_services(service_names: Optional[List[str]], f'{service_num} service{plural} found. Please specify ' 'an existing service to show its endpoint. Usage: ' 'sky serve status --endpoint ') - msg = status_utils.get_endpoint(service_records[0]) + msg = serve_lib.get_endpoint(service_records[0]) else: - msg = status_utils.format_service_table(service_records, show_all) + msg = serve_lib.format_service_table(service_records, show_all) service_not_found_msg = '' if service_names is not None: for service_name in service_names: diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index 55ce2fada74..cd37299a385 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -11,8 +11,10 @@ from sky.serve.constants import SKYSERVE_METADATA_DIR from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus +from sky.serve.serve_utils import format_service_table from sky.serve.serve_utils import generate_replica_cluster_name from sky.serve.serve_utils import generate_service_name +from sky.serve.serve_utils import get_endpoint from sky.serve.serve_utils import ServeCodeGen from sky.serve.serve_utils import ServiceComponent from sky.serve.serve_utils import SKY_SERVE_CONTROLLER_NAME diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 710346dbb18..af0074309ba 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -26,6 +26,8 @@ from sky.serve import serve_state from sky.skylet import job_lib from sky.utils import common_utils +from sky.utils import log_utils +from sky.utils import resources_utils from sky.utils import ux_utils if typing.TYPE_CHECKING: @@ -43,6 +45,9 @@ f'{colorama.Fore.RED}Failed to find 
replica ' '{replica_id}. Please use `sky serve status [SERVICE_NAME]`' f' to check all valid replica id.{colorama.Style.RESET_ALL}') +# Max number of replicas to show in `sky serve status` by default. +# If user wants to see all replicas, use `sky serve status --all`. +_REPLICA_TRUNC_NUM = 10 class ServiceComponent(enum.Enum): @@ -623,6 +628,137 @@ def _service_is_terminal() -> bool: return '' +# ================== Table Formatter for `sky serve status` ================== + + +def _get_replicas(service_record: Dict[str, Any]) -> str: + ready_replica_num, total_replica_num = 0, 0 + for info in service_record['replica_info']: + if info['status'] == serve_state.ReplicaStatus.READY: + ready_replica_num += 1 + # If auto restart enabled, not count FAILED replicas here. + if (not service_record['auto_restart'] or + info['status'] != serve_state.ReplicaStatus.FAILED): + total_replica_num += 1 + return f'{ready_replica_num}/{total_replica_num}' + + +def get_endpoint(service_record: Dict[str, Any]) -> str: + # Don't use backend_utils.is_controller_up since it is too slow. + handle = global_user_state.get_handle_from_cluster_name( + SKY_SERVE_CONTROLLER_NAME) + assert isinstance(handle, backends.CloudVmRayResourceHandle) + if handle is None or handle.head_ip is None: + return '-' + load_balancer_port = service_record['load_balancer_port'] + if load_balancer_port is None: + return '-' + return f'{handle.head_ip}:{load_balancer_port}' + + +def format_service_table(service_records: List[Dict[str, Any]], + show_all: bool) -> str: + if not service_records: + return 'No existing services.' 
+ + service_columns = ['NAME', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'] + if show_all: + service_columns.extend(['POLICY', 'REQUESTED_RESOURCES']) + service_table = log_utils.create_table(service_columns) + + replica_infos = [] + for record in service_records: + for replica in record['replica_info']: + replica['service_name'] = record['name'] + replica_infos.append(replica) + + service_name = record['name'] + uptime = log_utils.readable_time_duration(record['uptime'], + absolute=True) + service_status = record['status'] + status_str = service_status.colored_str() + replicas = _get_replicas(record) + endpoint = get_endpoint(record) + policy = record['policy'] + requested_resources = record['requested_resources'] + + service_values = [ + service_name, + uptime, + status_str, + replicas, + endpoint, + ] + if show_all: + service_values.extend([policy, requested_resources]) + service_table.add_row(service_values) + + replica_table = _format_replica_table(replica_infos, show_all) + return (f'{service_table}\n' + f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Service Replicas{colorama.Style.RESET_ALL}\n' + f'{replica_table}') + + +def _format_replica_table(replica_records: List[Dict[str, Any]], + show_all: bool) -> str: + if not replica_records: + return 'No existing replicas.' + + replica_columns = [ + 'SERVICE_NAME', 'ID', 'IP', 'LAUNCHED', 'RESOURCES', 'STATUS', 'REGION' + ] + if show_all: + replica_columns.append('ZONE') + replica_table = log_utils.create_table(replica_columns) + + truncate_hint = '' + if not show_all: + if len(replica_records) > _REPLICA_TRUNC_NUM: + truncate_hint = '\n... 
(use --all to show all replicas)' + replica_records = replica_records[:_REPLICA_TRUNC_NUM] + + for record in replica_records: + service_name = record['service_name'] + replica_id = record['replica_id'] + replica_ip = '-' + launched_at = log_utils.readable_time_duration(record['launched_at']) + resources_str = '-' + replica_status = record['status'] + status_str = replica_status.colored_str() + region = '-' + zone = '-' + + replica_handle: 'backends.CloudVmRayResourceHandle' = record['handle'] + if replica_handle is not None: + if replica_handle.head_ip is not None: + replica_ip = replica_handle.head_ip + resources_str = resources_utils.get_cloud_resources_str( + replica_handle, simplify=not show_all) + if replica_handle.launched_resources.region is not None: + region = replica_handle.launched_resources.region + if replica_handle.launched_resources.zone is not None: + zone = replica_handle.launched_resources.zone + + replica_values = [ + service_name, + replica_id, + replica_ip, + launched_at, + resources_str, + status_str, + region, + ] + if show_all: + replica_values.append(zone) + replica_table.add_row(replica_values) + + return f'{replica_table}{truncate_hint}' + + +# =========================== CodeGen for Sky Serve =========================== + + # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern # is to reuse the authentication of ssh. If we want to use REST API, we need # to implement some authentication mechanism. 
diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 0af064323be..9279ea4d702 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -1,33 +1,24 @@ """Utilities for sky status.""" -import re -import typing from typing import Any, Callable, Dict, List, Optional import click import colorama from sky import backends -from sky import global_user_state from sky import status_lib from sky.backends import backend_utils from sky.skylet import constants from sky.utils import common_utils from sky.utils import log_utils - -if typing.TYPE_CHECKING: - from sky import serve +from sky.utils import resources_utils COMMAND_TRUNC_LENGTH = 25 -REPLICA_TRUNC_NUM = 10 NUM_COST_REPORT_LINES = 5 # A record in global_user_state's 'clusters' table. _ClusterRecord = Dict[str, Any] # A record returned by core.cost_report(); see its docstr for all fields. _ClusterCostReportRecord = Dict[str, Any] -# A record in serve_state's 'services' table. -_ServiceRecord = Dict[str, Any] -_ReplicaRecord = Dict[str, Any] def truncate_long_string(s: str, max_length: int = 35) -> str: @@ -117,87 +108,6 @@ def show_status_table(cluster_records: List[_ClusterRecord], return num_pending_autostop -# TODO(tian): Refactor to sky/serve. -def format_service_table(service_records: List[_ServiceRecord], - show_all: bool) -> str: - if not service_records: - return 'No existing services.' 
- - status_columns = [ - StatusColumn('NAME', _get_name), - StatusColumn('UPTIME', _get_uptime), - StatusColumn('STATUS', _get_service_status_colored), - StatusColumn('REPLICAS', _get_replicas), - StatusColumn('ENDPOINT', get_endpoint), - StatusColumn('POLICY', _get_policy, show_by_default=False), - StatusColumn('REQUESTED_RESOURCES', - _get_requested_resources, - show_by_default=False), - ] - - columns = [] - for status_column in status_columns: - if status_column.show_by_default or show_all: - columns.append(status_column.name) - service_table = log_utils.create_table(columns) - replica_infos = [] - for record in service_records: - row = [] - for status_column in status_columns: - if status_column.show_by_default or show_all: - row.append(status_column.calc(record)) - service_table.add_row(row) - for replica in record['replica_info']: - replica['service_name'] = record['name'] - replica_infos.append(replica) - - replica_table = format_replica_table(replica_infos, show_all) - return (f'{service_table}\n' - f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Service Replicas{colorama.Style.RESET_ALL}\n' - f'{replica_table}') - - -def format_replica_table(replica_records: List[_ReplicaRecord], - show_all: bool) -> str: - if not replica_records: - return 'No existing replicas.' - - status_columns = [ - StatusColumn('SERVICE_NAME', _get_service_name), - StatusColumn('ID', _get_replica_id), - StatusColumn('IP', _get_head_ip), - StatusColumn('LAUNCHED', _get_launched), - StatusColumn( - 'RESOURCES', - _get_full_replica_resources if show_all else _get_replica_resources, - trunc_length=70 if not show_all else 0), - StatusColumn('REGION', _get_replica_region), - StatusColumn('ZONE', _get_replica_zone, show_by_default=False), - StatusColumn('STATUS', _get_status_colored), - ] - - truncate_hint = '' - if not show_all: - if len(replica_records) > REPLICA_TRUNC_NUM: - truncate_hint = '\n... 
(use --all to show all replicas)' - replica_records = replica_records[:REPLICA_TRUNC_NUM] - - columns = [] - for status_column in status_columns: - if status_column.show_by_default or show_all: - columns.append(status_column.name) - replica_table = log_utils.create_table(columns) - for record in replica_records: - row = [] - for status_column in status_columns: - if status_column.show_by_default or show_all: - row.append(status_column.calc(record)) - replica_table.add_row(row) - - return f'{replica_table}{truncate_hint}' - - def get_total_cost_of_displayed_records( cluster_records: List[_ClusterCostReportRecord], display_all: bool): """Compute total cost of records to be displayed in cost report.""" @@ -398,58 +308,6 @@ def show_local_status_table(local_clusters: List[str]): _get_command = (lambda cluster_record: cluster_record['last_use']) _get_duration = (lambda cluster_record: log_utils.readable_time_duration( 0, cluster_record['duration'], absolute=True)) -_get_replica_id = lambda replica_record: replica_record['replica_id'] -_get_service_name = lambda replica_record: replica_record['service_name'] -_get_policy = lambda replica_record: replica_record['policy'] -_get_requested_resources = lambda replica_record: replica_record[ - 'requested_resources'] - - -def _get_uptime(service_record: _ServiceRecord) -> str: - uptime = service_record['uptime'] - if uptime is None: - return '-' - return log_utils.readable_time_duration(uptime, absolute=True) - - -def _get_replicas(service_record: _ServiceRecord) -> str: - # Import here to avoid circular import - from sky import serve # pylint: disable=import-outside-toplevel - - ready_replica_num, total_replica_num = 0, 0 - for info in service_record['replica_info']: - if _get_status(info) == serve.ReplicaStatus.READY: - ready_replica_num += 1 - # If auto restart enabled, not count FAILED replicas here. 
- if (not service_record['auto_restart'] or - _get_status(info) != serve.ReplicaStatus.FAILED): - total_replica_num += 1 - return f'{ready_replica_num}/{total_replica_num}' - - -def get_endpoint(service_record: _ServiceRecord) -> str: - # Import here to avoid circular import - from sky import serve # pylint: disable=import-outside-toplevel - - # Don't use backend_utils.is_controller_up since it is too slow. - handle = global_user_state.get_handle_from_cluster_name( - serve.SKY_SERVE_CONTROLLER_NAME) - assert isinstance(handle, backends.CloudVmRayResourceHandle) - if handle is None or handle.head_ip is None: - return '-' - load_balancer_port = service_record['load_balancer_port'] - if load_balancer_port is None: - return '-' - return f'{handle.head_ip}:{load_balancer_port}' - - -def _get_service_status( - service_record: _ServiceRecord) -> 'serve.ServiceStatus': - return service_record['status'] - - -def _get_service_status_colored(service_record: _ServiceRecord) -> str: - return _get_service_status(service_record).colored_str() def _get_status(cluster_record: _ClusterRecord) -> status_lib.ClusterStatus: @@ -462,23 +320,10 @@ def _get_status_colored(cluster_record: _ClusterRecord) -> str: def _get_resources(cluster_record: _ClusterRecord) -> str: handle = cluster_record['handle'] - resources_str = '' if isinstance(handle, backends.LocalDockerResourceHandle): resources_str = 'docker' elif isinstance(handle, backends.CloudVmRayResourceHandle): - if (handle.launched_nodes is not None and - handle.launched_resources is not None): - launched_resource_str = str(handle.launched_resources) - # accelerator_args is way too long. - # Convert from: - # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long - # to: - # GCP(n1-highmem-8, {'tpu-v2-8': 1}...) 
- pattern = ', accelerator_args={.*}' - launched_resource_str = re.sub(pattern, '...', - launched_resource_str) - resources_str = (f'{handle.launched_nodes}x ' - f'{launched_resource_str}') + resources_str = resources_utils.get_cloud_resources_str(handle) else: raise ValueError(f'Unknown handle type {type(handle)} encountered.') return resources_str @@ -491,44 +336,6 @@ def _get_zone(cluster_record: _ClusterRecord) -> str: return zone_str -def _get_full_replica_resources(replica_record: _ReplicaRecord) -> str: - handle = replica_record['handle'] - if handle is None: - return '-' - return _get_resources(replica_record) - - -def _get_replica_resources(replica_record: _ReplicaRecord) -> str: - handle = replica_record['handle'] - if handle is None: - return '-' - assert isinstance(handle, backends.CloudVmRayResourceHandle) - cloud = handle.launched_resources.cloud - if handle.launched_resources.accelerators is None: - vcpu, _ = cloud.get_vcpus_mem_from_instance_type( - handle.launched_resources.instance_type) - hardware = f'vCPU={int(vcpu)}' - else: - hardware = f'{handle.launched_resources.accelerators})' - spot = '[Spot]' if handle.launched_resources.use_spot else '' - resources_str = f'{handle.launched_nodes}x {cloud}({spot}{hardware})' - return resources_str - - -def _get_replica_region(replica_record: _ReplicaRecord) -> str: - handle = replica_record['handle'] - if handle is None: - return '-' - return _get_region(replica_record) - - -def _get_replica_zone(replica_record: _ReplicaRecord) -> str: - handle = replica_record['handle'] - if handle is None: - return '-' - return _get_zone(replica_record) - - def _get_autostop(cluster_record: _ClusterRecord) -> str: autostop_str = '' separation = '' diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index d97518f6320..7edf95930a9 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -1,13 +1,19 @@ """Utility functions for resources.""" import itertools +import re +import 
typing from typing import List, Optional, Set from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from sky import backends + _PORT_RANGE_HINT_MSG = ('Invalid port range {}. Please use the format ' '"from-to", in which from <= to. e.g. "1-3".') _PORT_HINT_MSG = ('Invalid port {}. ' 'Please use a port number between 1 and 65535.') +_DEFAULT_MESSAGE_HANDLE_INITIALIZING = '' def check_port_str(port: str) -> None: @@ -83,3 +89,31 @@ def simplify_ports(ports: List[str]) -> List[str]: For example, ['1-2', '3', '5-6', '7'] will be simplified to ['1-3', '5-7']. """ return port_set_to_ranges(port_ranges_to_set(ports)) + + +def get_cloud_resources_str(handle: 'backends.CloudVmRayResourceHandle', + simplify: bool = False) -> str: + if (handle.launched_nodes is not None and + handle.launched_resources is not None): + if simplify: + cloud = handle.launched_resources.cloud + if handle.launched_resources.accelerators is None: + vcpu, _ = cloud.get_vcpus_mem_from_instance_type( + handle.launched_resources.instance_type) + hardware = f'vCPU={int(vcpu)}' + else: + hardware = f'{handle.launched_resources.accelerators})' + spot = '[Spot]' if handle.launched_resources.use_spot else '' + return f'{handle.launched_nodes}x {cloud}({spot}{hardware})' + else: + launched_resource_str = str(handle.launched_resources) + # accelerator_args is way too long. + # Convert from: + # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long + # to: + # GCP(n1-highmem-8, {'tpu-v2-8': 1}...) 
+ pattern = ', accelerator_args={.*}' + launched_resource_str = re.sub(pattern, '...', + launched_resource_str) + return f'{handle.launched_nodes}x {launched_resource_str}' + return _DEFAULT_MESSAGE_HANDLE_INITIALIZING From e91b60e8863495fd351204423cff7c4481237a65 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 14 Nov 2023 18:23:23 -0800 Subject: [PATCH 222/223] minor --- sky/utils/cli_utils/status_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 9279ea4d702..ebdf8cdbedc 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -188,7 +188,7 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord], if cluster_records: if controller_name is not None: - autostop_minutes = (constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP) + autostop_minutes = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'{controller_name}{colorama.Style.RESET_ALL}' f'{colorama.Style.DIM} (will be autostopped if idle for ' From a60be5a63c0838f8658c53b8f5534d4c9d54da9c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 15 Nov 2023 09:25:38 -0800 Subject: [PATCH 223/223] apply suggestion from code review --- sky/cli.py | 10 +++++----- sky/serve/__init__.py | 8 ++++---- sky/serve/{api.py => core.py} | 2 +- sky/serve/serve_utils.py | 2 +- sky/utils/cli_utils/status_utils.py | 2 +- sky/utils/resources_utils.py | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) rename sky/serve/{api.py => core.py} (99%) diff --git a/sky/cli.py b/sky/cli.py index a27e10f75b4..1c40099ea90 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1692,7 +1692,7 @@ def _get_services(service_names: Optional[List[str]], if not service_names: # Change empty list to None service_names = None - service_records = sky.serve.status(service_names) + service_records = serve_lib.status(service_names) num_services = 
len(service_records) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status @@ -2863,7 +2863,7 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): with rich_utils.safe_status( '[bold cyan]Checking for running services[/]'): try: - services = sky.serve.status() + services = serve_lib.status() except exceptions.ClusterNotUpError: cluster_status = backend_utils.refresh_cluster_status_handle( controller_name) @@ -4282,7 +4282,7 @@ def serve_up( if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) - sky.serve.up(task, service_name) + serve_lib.up(task, service_name) @serve.command('status', cls=_DocumentedCodeCommand) @@ -4474,7 +4474,7 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): abort=True, show_default=True) - sky.serve.down(service_names=service_names, all=all, purge=purge) + serve_lib.down(service_names=service_names, all=all, purge=purge) @serve.command('logs', cls=_DocumentedCodeCommand) @@ -4538,7 +4538,7 @@ def serve_logs( assert replica_id is not None target_component = serve_lib.ServiceComponent.REPLICA try: - sky.serve.tail_logs(service_name, + serve_lib.tail_logs(service_name, target=target_component, replica_id=replica_id, follow=follow) diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index cd37299a385..19e0df64d2e 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -1,14 +1,14 @@ """Modules for SkyServe services.""" import os -from sky.serve.api import down -from sky.serve.api import status -from sky.serve.api import tail_logs -from sky.serve.api import up from sky.serve.constants import ENDPOINT_PROBE_INTERVAL_SECONDS from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL_SECONDS from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.constants import SKYSERVE_METADATA_DIR +from sky.serve.core import down +from sky.serve.core import status +from sky.serve.core import tail_logs +from 
sky.serve.core import up from sky.serve.serve_state import ReplicaStatus from sky.serve.serve_state import ServiceStatus from sky.serve.serve_utils import format_service_table diff --git a/sky/serve/api.py b/sky/serve/core.py similarity index 99% rename from sky/serve/api.py rename to sky/serve/core.py index 0a32a549e3a..44ae04c4c02 100644 --- a/sky/serve/api.py +++ b/sky/serve/core.py @@ -139,7 +139,7 @@ def up( # controller to check name conflict. Suppose we have multiple # sky.serve.up() with same service name, the first one will # successfully write its job id to controller service database; - # and for all following sky.serve.up, the controller will throw + # and for all following sky.serve.up(), the controller will throw # an exception (name conflict detected) and exit. Therefore the # controller job id in database could be use as an indicator of # whether the service is already running. If the id is the same diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index af0074309ba..3f9e8a0888c 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -733,7 +733,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], if replica_handle is not None: if replica_handle.head_ip is not None: replica_ip = replica_handle.head_ip - resources_str = resources_utils.get_cloud_resources_str( + resources_str = resources_utils.get_readable_resources_repr( replica_handle, simplify=not show_all) if replica_handle.launched_resources.region is not None: region = replica_handle.launched_resources.region diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index ebdf8cdbedc..af433d4e78b 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -323,7 +323,7 @@ def _get_resources(cluster_record: _ClusterRecord) -> str: if isinstance(handle, backends.LocalDockerResourceHandle): resources_str = 'docker' elif isinstance(handle, backends.CloudVmRayResourceHandle): - 
resources_str = resources_utils.get_cloud_resources_str(handle) + resources_str = resources_utils.get_readable_resources_repr(handle) else: raise ValueError(f'Unknown handle type {type(handle)} encountered.') return resources_str diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 7edf95930a9..04050ca08cd 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -91,8 +91,8 @@ def simplify_ports(ports: List[str]) -> List[str]: return port_set_to_ranges(port_ranges_to_set(ports)) -def get_cloud_resources_str(handle: 'backends.CloudVmRayResourceHandle', - simplify: bool = False) -> str: +def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle', + simplify: bool = False) -> str: if (handle.launched_nodes is not None and handle.launched_resources is not None): if simplify: