Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support instances without public IP for GCP #1341

Merged
merged 2 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/docs/reference/server/config.yml.md
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,23 @@ gcloud projects list --format="json(projectId)"
You also need to have the `serviceAccountUser` role granted.
`dstack` will run TPUs under the default service account, so you don't need to create one.

??? info "Private subnets"
By default, `dstack` uses public subnets and permits only inbound SSH traffic to provisioned instances.
If you want `dstack` to use private subnets, set `public_ips` to `false`.

```yaml
projects:
- name: main
backends:
- type: gcp
creds:
type: default

public_ips: false
```

Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets (e.g., through VPC peering). Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances.

### OCI

There are two ways to configure OCI: using client credentials or using the default credentials.
Expand Down
23 changes: 22 additions & 1 deletion src/dstack/_internal/core/backends/gcp/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dstack._internal.core.backends.base.offers import get_catalog_offers
from dstack._internal.core.backends.gcp.config import GCPConfig
from dstack._internal.core.errors import (
ComputeError,
ComputeResourceNotFoundError,
NoCapacityError,
)
Expand Down Expand Up @@ -47,6 +48,7 @@ def __init__(self, config: GCPConfig):
self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials)
self.regions_client = compute_v1.RegionsClient(credentials=self.credentials)
self.subnetworks_client = compute_v1.SubnetworksClient(credentials=self.credentials)
self.routers_client = compute_v1.RoutersClient(credentials=self.credentials)
self.tpu_client = tpu_v2.TpuClient(credentials=self.credentials)

def get_offers(
Expand Down Expand Up @@ -112,6 +114,7 @@ def create_instance(
instance_config: InstanceConfiguration,
) -> JobProvisioningData:
instance_name = instance_config.instance_name
allocate_public_ip = self.config.allocate_public_ips
if not gcp_resources.is_valid_resource_name(instance_name):
# In a rare case the instance name is invalid in GCP,
# we better use a random instance name than fail provisioning.
Expand Down Expand Up @@ -198,6 +201,18 @@ def create_instance(
)
raise NoCapacityError()

if not allocate_public_ip and not gcp_resources.has_vpc_nat_access(
routers_client=self.routers_client,
project_id=self.config.vpc_project_id or self.config.project_id,
vpc_name=self.config.vpc_resource_name,
region=instance_offer.region,
):
raise ComputeError(
"VPC does not have access to the external internet through Cloud NAT. "
f"Region: {instance_offer.region}, VPC name: {self.config.vpc_resource_name}, "
f"Project ID: {self.config.vpc_project_id or self.config.project_id}."
)

for zone in _get_instance_zones(instance_offer):
request = compute_v1.InsertInstanceRequest()
request.zone = zone
Expand All @@ -222,6 +237,7 @@ def create_instance(
zone=zone,
network=self.config.vpc_resource_name,
subnetwork=subnetwork,
allocate_public_ip=allocate_public_ip,
)
try:
operation = self.instances_client.insert(request=request)
Expand All @@ -234,11 +250,16 @@ def create_instance(
instance = self.instances_client.get(
project=self.config.project_id, zone=zone, instance=instance_name
)
if allocate_public_ip:
hostname = instance.network_interfaces[0].access_configs[0].nat_i_p
else:
hostname = instance.network_interfaces[0].network_i_p
return JobProvisioningData(
backend=instance_offer.backend,
instance_type=instance_offer.instance,
instance_id=instance_name,
hostname=instance.network_interfaces[0].access_configs[0].nat_i_p,
public_ip_enabled=allocate_public_ip,
hostname=hostname,
internal_ip=instance.network_interfaces[0].network_i_p,
region=instance_offer.region,
price=instance_offer.price,
Expand Down
6 changes: 6 additions & 0 deletions src/dstack/_internal/core/backends/gcp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
class GCPConfig(GCPStoredConfig, BackendConfig):
creds: AnyGCPCreds

@property
def allocate_public_ips(self) -> bool:
    """Tell whether provisioned instances should be given a public IP.

    Returns the configured ``public_ips`` value when it is set, and
    ``True`` when it is left as ``None``.
    """
    configured = self.public_ips
    return True if configured is None else configured

@property
def vpc_resource_name(self) -> str:
vpc_name = self.vpc_name
Expand Down
50 changes: 45 additions & 5 deletions src/dstack/_internal/core/backends/gcp/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@

def check_vpc(
network_client: compute_v1.NetworksClient,
routers_client: compute_v1.RoutersClient,
project_id: str,
regions: List[str],
allocate_public_ip: bool,
vpc_name: Optional[str] = None,
shared_vpc_project_id: Optional[str] = None,
):
Expand All @@ -46,6 +49,38 @@ def check_vpc(
except google.api_core.exceptions.NotFound:
raise ComputeError(f"Failed to find VPC {vpc_name} in project {vpc_project_id}")

if allocate_public_ip:
return

regions_without_nat = []
for region in regions:
if not has_vpc_nat_access(routers_client, vpc_project_id, vpc_name, region):
regions_without_nat.append(region)

if regions_without_nat:
raise ComputeError(
f"VPC {vpc_name} in project {vpc_project_id} does not have Cloud NAT configured for external internet access in regions: {regions_without_nat}"
)


def has_vpc_nat_access(
    routers_client: compute_v1.RoutersClient,
    project_id: str,
    vpc_name: str,
    region: str,
) -> bool:
    """Report whether a VPC has a Cloud NAT gateway configured in a region.

    Lists the Cloud Routers of ``project_id`` in ``region`` and looks for one
    that is attached to the given network and has at least one NAT defined.

    Args:
        routers_client: Client used to list the routers.
        project_id: Project whose routers are listed.
        vpc_name: Either a bare network name or a resource path such as
            ``global/networks/<name>``; callers pass both forms.
        region: Region to inspect.

    Returns:
        ``True`` if a router on the network has one or more NATs, ``False``
        otherwise (including when the listing raises ``NotFound``).
    """
    try:
        routers = routers_client.list(project=project_id, region=region)
    except google.api_core.exceptions.NotFound:
        return False

    # Match on a path-segment boundary. A raw ``endswith(vpc_name)`` would
    # let a NAT on network "staging-main" satisfy the check for VPC "main".
    # NOTE(review): router.network is presumably a full resource URL ending
    # in ".../global/networks/<name>" - confirm against the Compute API.
    segment_suffix = "/" + vpc_name.lstrip("/")

    for router in routers:
        network = router.network
        if network != vpc_name and not network.endswith(segment_suffix):
            continue
        if len(router.nats) > 0:
            return True

    return False


def create_instance_struct(
disk_size: int,
Expand All @@ -62,16 +97,21 @@ def create_instance_struct(
service_account: Optional[str] = None,
network: str = "global/networks/default",
subnetwork: Optional[str] = None,
allocate_public_ip: bool = True,
) -> compute_v1.Instance:
network_interface = compute_v1.NetworkInterface()
network_interface.network = network
if subnetwork is not None:
network_interface.subnetwork = subnetwork
access = compute_v1.AccessConfig()
access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
access.name = "External NAT"
access.network_tier = access.NetworkTier.PREMIUM.name
network_interface.access_configs = [access]

if allocate_public_ip:
access = compute_v1.AccessConfig()
access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
access.name = "External NAT"
access.network_tier = access.NetworkTier.PREMIUM.name
network_interface.access_configs = [access]
else:
network_interface.access_configs = []

instance = compute_v1.Instance()
instance.network_interfaces = [network_interface]
Expand Down
2 changes: 2 additions & 0 deletions src/dstack/_internal/core/models/backends/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class GCPConfigInfo(CoreModel):
regions: Optional[List[str]] = None
vpc_name: Optional[str] = None
vpc_project_id: Optional[str] = None
public_ips: Optional[bool] = None


class GCPServiceAccountCreds(CoreModel):
Expand Down Expand Up @@ -46,6 +47,7 @@ class GCPConfigInfoWithCredsPartial(CoreModel):
regions: Optional[List[str]]
vpc_name: Optional[str] = None
vpc_project_id: Optional[str] = None
public_ips: Optional[bool]


class GCPConfigValues(CoreModel):
Expand Down
16 changes: 14 additions & 2 deletions src/dstack/_internal/server/services/backends/configurators/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,12 @@ def get_config_values(self, config: GCPConfigInfoWithCredsPartial) -> GCPConfigV
if config.project_id is None:
return config_values
network_client = compute_v1.NetworksClient(credentials=credentials)
self._check_vpc_config(network_client=network_client, config=config)
routers_client = compute_v1.RoutersClient(credentials=credentials)
self._check_vpc_config(
network_client=network_client,
routers_client=routers_client,
config=config,
)
return config_values

def create_backend(
Expand Down Expand Up @@ -220,14 +225,21 @@ def _get_regions_element(
return element

def _check_vpc_config(
    self,
    network_client: compute_v1.NetworksClient,
    routers_client: compute_v1.RoutersClient,
    config: GCPConfigInfoWithCredsPartial,
):
    """Validate the VPC settings of a GCP backend config.

    Delegates to ``resources.check_vpc`` with the project, regions, and VPC
    fields taken from ``config``, then converts a backend-level failure into
    an error suitable for returning to the client.

    Args:
        network_client: Client passed through to ``resources.check_vpc``.
        routers_client: Client passed through to ``resources.check_vpc``.
        config: Partial backend config being validated.

    Raises:
        ServerClientError: If ``resources.check_vpc`` raises ``ComputeError``;
            carries the same message.
    """
    # An unset ``public_ips`` means public IPs are allocated.
    allocate_public_ip = config.public_ips if config.public_ips is not None else True
    try:
        resources.check_vpc(
            network_client=network_client,
            routers_client=routers_client,
            project_id=config.project_id,
            # Fall back to the default region list when none are configured.
            regions=config.regions or DEFAULT_REGIONS,
            vpc_name=config.vpc_name,
            shared_vpc_project_id=config.vpc_project_id,
            allocate_public_ip=allocate_public_ip,
        )
    except ComputeError as e:
        # Chain the original error so the underlying cause stays in tracebacks.
        raise ServerClientError(e.args[0]) from e
6 changes: 6 additions & 0 deletions src/dstack/_internal/server/services/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ class GCPConfig(CoreModel):
Optional[str],
Field(description="The shared VPC hosted project ID. Required for shared VPC only"),
] = None
public_ips: Annotated[
Optional[bool],
Field(
description="A flag to enable/disable public IP assigning on instances. Defaults to `true`"
),
] = None
creds: AnyGCPCreds = Field(..., description="The credentials", discriminator="type")


Expand Down
Loading