Skip to content

Commit

Permalink
[k8s] Optimize ingress creation to avoid nginx hot-reloads (#3263)
Browse files Browse the repository at this point in the history
* optimize ingress

* fix

* fix

* add comment
  • Loading branch information
romilbhardwaj authored Mar 15, 2024
1 parent 498d02c commit 823999a
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 49 deletions.
69 changes: 44 additions & 25 deletions sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,39 +57,55 @@ def _open_ports_using_ingress(
ports: List[int],
provider_config: Dict[str, Any],
) -> None:
# Check if an ingress controller exists
if not network_utils.ingress_controller_exists():
raise Exception(
'Ingress controller not found. '
'Install Nginx ingress controller first: '
'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.' # pylint: disable=line-too-long
)

for port in ports:
service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}'
ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}'
path_prefix = _PATH_PREFIX.format(
cluster_name_on_cloud=cluster_name_on_cloud, port=port)
# Prepare service names, ports, for template rendering
service_details = [
(f'{cluster_name_on_cloud}-skypilot-service--{port}', port,
_PATH_PREFIX.format(cluster_name_on_cloud=cluster_name_on_cloud,
port=port).rstrip('/').lstrip('/'))
for port in ports
]

# Generate ingress and services specs
# We batch ingress rule creation because each rule triggers a hot reload of
# the nginx controller. If the ingress rules are created sequentially,
# it could lead to multiple reloads of the Nginx-Ingress-Controller within
# a brief period. Consequently, the Nginx-Controller pod might spawn an
# excessive number of sub-processes. This surge triggers Kubernetes to kill
# and restart the Nginx due to the podPidsLimit parameter, which is
# typically set to a default value like 1024.
# To avoid this, we change ingress creation into one object containing
# multiple rules.
content = network_utils.fill_ingress_template(
namespace=provider_config.get('namespace', 'default'),
service_details=service_details,
ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
selector_key='skypilot-cluster',
selector_value=cluster_name_on_cloud,
)

content = network_utils.fill_ingress_template(
namespace=provider_config.get('namespace', 'default'),
path_prefix=path_prefix,
service_name=service_name,
service_port=port,
ingress_name=ingress_name,
selector_key='skypilot-cluster',
selector_value=cluster_name_on_cloud,
)
# Create or update services based on the generated specs
for service_name, service_spec in content['services_spec'].items():
network_utils.create_or_replace_namespaced_service(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name,
service_spec=content['service_spec'],
)
network_utils.create_or_replace_namespaced_ingress(
namespace=provider_config.get('namespace', 'default'),
ingress_name=ingress_name,
ingress_spec=content['ingress_spec'],
service_spec=service_spec,
)

# Create or update the single ingress for all services
network_utils.create_or_replace_namespaced_ingress(
namespace=provider_config.get('namespace', 'default'),
ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
ingress_spec=content['ingress_spec'],
)


def cleanup_ports(
cluster_name_on_cloud: str,
Expand Down Expand Up @@ -128,17 +144,20 @@ def _cleanup_ports_for_ingress(
ports: List[int],
provider_config: Dict[str, Any],
) -> None:
# Delete services for each port
for port in ports:
service_name = f'{cluster_name_on_cloud}-skypilot-service--{port}'
ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress--{port}'
network_utils.delete_namespaced_service(
namespace=provider_config.get('namespace', 'default'),
service_name=service_name,
)
network_utils.delete_namespaced_ingress(
namespace=provider_config.get('namespace', 'default'),
ingress_name=ingress_name,
)

# Delete the single ingress used for all ports
ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
network_utils.delete_namespaced_ingress(
namespace=provider_config.get('namespace', 'default'),
ingress_name=ingress_name,
)


def query_ports(
Expand Down
22 changes: 15 additions & 7 deletions sky/provision/kubernetes/network_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,10 @@ def fill_loadbalancer_template(namespace: str, service_name: str,
return content


def fill_ingress_template(namespace: str, path_prefix: str, service_name: str,
service_port: int, ingress_name: str,
selector_key: str, selector_value: str) -> Dict:
def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
str]],
ingress_name: str, selector_key: str,
selector_value: str) -> Dict:
template_path = os.path.join(sky.__root_dir__, 'templates',
_INGRESS_TEMPLATE_NAME)
if not os.path.exists(template_path):
Expand All @@ -70,15 +71,22 @@ def fill_ingress_template(namespace: str, path_prefix: str, service_name: str,
j2_template = jinja2.Template(template)
cont = j2_template.render(
namespace=namespace,
path_prefix=path_prefix.rstrip('/').lstrip('/'),
service_name=service_name,
service_port=service_port,
service_names_and_ports=[{
'service_name': name,
'service_port': port,
'path_prefix': path_prefix
} for name, port, path_prefix in service_details],
ingress_name=ingress_name,
selector_key=selector_key,
selector_value=selector_value,
)
content = yaml.safe_load(cont)
return content

# Return a dictionary containing both specs
return {
'ingress_spec': content['ingress_spec'],
'services_spec': content['services_spec']
}


def create_or_replace_namespaced_ingress(
Expand Down
39 changes: 22 additions & 17 deletions sky/templates/kubernetes-ingress.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,29 @@ ingress_spec:
rules:
- http:
paths:
- path: /{{ path_prefix }}(/|$)(.*)
{% for service in service_names_and_ports %}
- path: /{{ service.path_prefix }}(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: {{ service_name }}
name: {{ service.service_name }}
port:
number: {{ service_port }}
service_spec:
apiVersion: v1
kind: Service
metadata:
name: {{ service_name }}
labels:
parent: skypilot
spec:
type: ClusterIP
selector:
{{ selector_key }}: {{ selector_value }}
ports:
- port: {{ service_port }}
targetPort: {{ service_port }}
number: {{ service.service_port }}
{% endfor %}
services_spec:
{% for service in service_names_and_ports %}
{{ service.service_name }}:
apiVersion: v1
kind: Service
metadata:
name: {{ service.service_name }}
labels:
parent: skypilot
spec:
type: ClusterIP
selector:
{{ selector_key }}: {{ selector_value }}
ports:
- port: {{ service.service_port }}
targetPort: {{ service.service_port }}
{% endfor %}

0 comments on commit 823999a

Please sign in to comment.