diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index dcc93145f71..21786a00f6a 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -57,6 +57,7 @@ def _open_ports_using_ingress( ports: List[int], provider_config: Dict[str, Any], ) -> None: + # Check if an ingress controller exists if not network_utils.ingress_controller_exists(): raise Exception( 'Ingress controller not found. ' @@ -64,32 +65,47 @@ def _open_ports_using_ingress( 'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.' # pylint: disable=line-too-long ) - for port in ports: - service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}' - ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}' - path_prefix = _PATH_PREFIX.format( - cluster_name_on_cloud=cluster_name_on_cloud, port=port) + # Prepare service names, ports, for template rendering + service_details = [ + (f'{cluster_name_on_cloud}-skypilot-service--{port}', port, + _PATH_PREFIX.format(cluster_name_on_cloud=cluster_name_on_cloud, + port=port).rstrip('/').lstrip('/')) + for port in ports + ] + + # Generate ingress and services specs + # We batch ingress rule creation because each rule triggers a hot reload of + # the nginx controller. If the ingress rules are created sequentially, + # it could lead to multiple reloads of the Nginx-Ingress-Controller within + # a brief period. Consequently, the Nginx-Controller pod might spawn an + # excessive number of sub-processes. This surge triggers Kubernetes to kill + # and restart the Nginx due to the podPidsLimit parameter, which is + # typically set to a default value like 1024. + # To avoid this, we change ingress creation into one object containing + # multiple rules. + content = network_utils.fill_ingress_template( + namespace=provider_config.get('namespace', 'default'), + service_details=service_details, + ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress', + selector_key='skypilot-cluster', + selector_value=cluster_name_on_cloud, + ) - content = network_utils.fill_ingress_template( - namespace=provider_config.get('namespace', 'default'), - path_prefix=path_prefix, - service_name=service_name, - service_port=port, - ingress_name=ingress_name, - selector_key='skypilot-cluster', - selector_value=cluster_name_on_cloud, - ) + # Create or update services based on the generated specs + for service_name, service_spec in content['services_spec'].items(): network_utils.create_or_replace_namespaced_service( namespace=provider_config.get('namespace', 'default'), service_name=service_name, - service_spec=content['service_spec'], - ) - network_utils.create_or_replace_namespaced_ingress( - namespace=provider_config.get('namespace', 'default'), - ingress_name=ingress_name, - ingress_spec=content['ingress_spec'], + service_spec=service_spec, ) + # Create or update the single ingress for all services + network_utils.create_or_replace_namespaced_ingress( + namespace=provider_config.get('namespace', 'default'), + ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress', + ingress_spec=content['ingress_spec'], + ) + def cleanup_ports( cluster_name_on_cloud: str, @@ -128,17 +144,20 @@ def _cleanup_ports_for_ingress( ports: List[int], provider_config: Dict[str, Any], ) -> None: + # Delete services for each port for port in ports: service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}' - ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}' network_utils.delete_namespaced_service( namespace=provider_config.get('namespace', 'default'), service_name=service_name, ) - network_utils.delete_namespaced_ingress( - namespace=provider_config.get('namespace', 'default'), - ingress_name=ingress_name, - ) + + # Delete the single ingress used for all ports + ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress' + network_utils.delete_namespaced_ingress( + namespace=provider_config.get('namespace', 'default'), + ingress_name=ingress_name, + ) def query_ports( diff --git a/sky/provision/kubernetes/network_utils.py b/sky/provision/kubernetes/network_utils.py index db90d7f7f58..b5440125e5b 100644 --- a/sky/provision/kubernetes/network_utils.py +++ b/sky/provision/kubernetes/network_utils.py @@ -57,9 +57,10 @@ def fill_loadbalancer_template(namespace: str, service_name: str, return content -def fill_ingress_template(namespace: str, path_prefix: str, service_name: str, - service_port: int, ingress_name: str, - selector_key: str, selector_value: str) -> Dict: +def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int, + str]], + ingress_name: str, selector_key: str, + selector_value: str) -> Dict: template_path = os.path.join(sky.__root_dir__, 'templates', _INGRESS_TEMPLATE_NAME) if not os.path.exists(template_path): @@ -70,15 +71,22 @@ def fill_ingress_template(namespace: str, path_prefix: str, service_name: str, j2_template = jinja2.Template(template) cont = j2_template.render( namespace=namespace, - path_prefix=path_prefix.rstrip('/').lstrip('/'), - service_name=service_name, - service_port=service_port, + service_names_and_ports=[{ + 'service_name': name, + 'service_port': port, + 'path_prefix': path_prefix + } for name, port, path_prefix in service_details], ingress_name=ingress_name, selector_key=selector_key, selector_value=selector_value, ) content = yaml.safe_load(cont) - return content + + # Return a dictionary containing both specs + return { + 'ingress_spec': content['ingress_spec'], + 'services_spec': content['services_spec'] + } def create_or_replace_namespaced_ingress( diff --git a/sky/templates/kubernetes-ingress.yml.j2 b/sky/templates/kubernetes-ingress.yml.j2 index 1a1dcbcafc5..84b3e9d6998 100644 --- a/sky/templates/kubernetes-ingress.yml.j2 +++ b/sky/templates/kubernetes-ingress.yml.j2 @@ -12,24 +12,29 @@ ingress_spec: rules: - http: paths: - - path: /{{ path_prefix }}(/|$)(.*) + {% for service in service_names_and_ports %} + - path: /{{ service.path_prefix }}(/|$)(.*) pathType: ImplementationSpecific backend: service: - name: {{ service_name }} + name: {{ service.service_name }} port: - number: {{ service_port }} -service_spec: - apiVersion: v1 - kind: Service - metadata: - name: {{ service_name }} - labels: - parent: skypilot - spec: - type: ClusterIP - selector: - {{ selector_key }}: {{ selector_value }} - ports: - - port: {{ service_port }} - targetPort: {{ service_port }} + number: {{ service.service_port }} + {% endfor %} +services_spec: + {% for service in service_names_and_ports %} + {{ service.service_name }}: + apiVersion: v1 + kind: Service + metadata: + name: {{ service.service_name }} + labels: + parent: skypilot + spec: + type: ClusterIP + selector: + {{ selector_key }}: {{ selector_value }} + ports: + - port: {{ service.service_port }} + targetPort: {{ service.service_port }} + {% endfor %}