Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update GCP API #58

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ require (
)

replace (
cloud.google.com/go/compute v1.33.0 => ../../google/cloud.google.com/go/compute
github.com/aws/aws-sdk-go-v2 v1.32.4 => github.com/pkedy/aws-sdk-go-v2 v0.0.0-20241115203348-0198b6c98cd9
github.com/aws/aws-sdk-go-v2/config v1.28.4 => github.com/pkedy/aws-sdk-go-v2/config v0.0.0-20241115203348-0198b6c98cd9
github.com/aws/aws-sdk-go-v2/credentials v1.17.45 => github.com/pkedy/aws-sdk-go-v2/credentials v0.0.0-20241115203348-0198b6c98cd9
Expand Down
8 changes: 2 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
cloud.google.com/go v0.118.1 h1:b8RATMcrK9A4BH0rj8yQupPXp+aP+cJ0l6H7V9osV1E=
cloud.google.com/go v0.118.1/go.mod h1:CFO4UPEPi8oV21xoezZCrd3d81K4fFkDTEJu4R8K+9M=
cloud.google.com/go/auth v0.14.0 h1:A5C4dKV/Spdvxcl0ggWwWEzzP7AZMJSEIgrkngwhGYM=
cloud.google.com/go/auth v0.14.0/go.mod h1:CYsoRL1PdiDuqeQpZE0bP2pnPrGqFcOkI0nldEQis+A=
cloud.google.com/go/auth/oauth2adapt v0.2.7 h1:/Lc7xODdqcEw8IrZ9SvwnlLX6j9FHQM74z6cBk9Rw6M=
cloud.google.com/go/auth/oauth2adapt v0.2.7/go.mod h1:NTbTTzfvPl1Y3V1nPpOgl2w6d/FjO7NNUQaWSox6ZMc=
cloud.google.com/go/compute v1.33.0 h1:abGcwWokP7/bBpvRjUKlgchrZXYgRpwcKZIlNUHWf6Y=
cloud.google.com/go/compute v1.33.0/go.mod h1:Z8NErRhrWA3RmVWczlAPJjZcRTlqZB1pcpD0MaIc1ug=
cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4j01OwKxG9I=
cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg=
github.com/agrea/ptr v0.2.0 h1:QSyCkddC52uOrIvkypI8vTqUFw0KAnP71u1JU36EvBk=
Expand Down Expand Up @@ -178,8 +174,8 @@ go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY=
go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI=
go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ=
go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE=
go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A=
go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU=
go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4=
go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU=
go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiyYCU9snn1CU=
go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ=
go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k=
Expand Down
57 changes: 26 additions & 31 deletions pkg/providers/gcp/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ import (
"context"
"fmt"
"strconv"
"strings"

"cloud.google.com/go/compute/apiv1/computepb"
"cloud.google.com/go/compute/apiv2alpha/computepb"
"github.com/agrea/ptr"
"google.golang.org/api/iterator"
"k8s.io/klog/v2"
Expand All @@ -36,30 +35,31 @@ func (p *baseProvider) generateInstanceTopology(ctx context.Context, pageSize *i
return nil, fmt.Errorf("failed to get client: %v", err)
}

projectID, err := client.ProjectID(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get project ID: %v", err)
}

topo := topology.NewClusterTopology()

for _, ci := range cis {
if err := p.generateRegionInstanceTopology(ctx, client, projectID, topo, &ci); err != nil {
if err := p.generateRegionInstanceTopology(ctx, client, topo, &ci); err != nil {
return nil, fmt.Errorf("failed to get instance topology: %v", err)
}
}

return topo, nil
}

func (p *baseProvider) generateRegionInstanceTopology(ctx context.Context, client Client, projectID string, topo *topology.ClusterTopology, ci *topology.ComputeInstances) error {
func (p *baseProvider) generateRegionInstanceTopology(ctx context.Context, client Client, topo *topology.ClusterTopology, ci *topology.ComputeInstances) error {
if len(ci.Region) == 0 {
return fmt.Errorf("must specify region")
}
projectID, err := client.ProjectID(ctx)
if err != nil {
return fmt.Errorf("failed to get project ID: %v", err)
}
klog.InfoS("Getting instance topology", "region", ci.Region, "project", projectID)

req := computepb.ListInstancesRequest{
Project: projectID,
Zone: ci.Region,
MaxResults: client.PageSize(),
PageToken: nil,
}

for {
Expand All @@ -80,25 +80,30 @@ func (p *baseProvider) generateRegionInstanceTopology(ctx context.Context, clien
if _, ok := ci.Instances[instanceId]; ok {
if instance.ResourceStatus == nil {
klog.InfoS("ResourceStatus is not set", "instance", instanceId)
resourceStatusNotFound.WithLabelValues(instanceId).Set(1)
missingResourceStatus.WithLabelValues(instanceId).Inc()
continue
}
resourceStatusNotFound.WithLabelValues(instanceId).Set(0)

if instance.ResourceStatus.PhysicalHost == nil {
klog.InfoS("PhysicalHost is not set", "instance", instanceId)
physicalHostNotFound.WithLabelValues(instanceId).Set(1)
if instance.ResourceStatus.PhysicalHostTopology == nil {
klog.InfoS("PhysicalHostTopology is not set", "instance", instanceId)
missingPhysicalHostTopology.WithLabelValues(instanceId).Inc()
continue
}
physicalHostNotFound.WithLabelValues(instanceId).Set(0)

tokens := strings.Split(*instance.ResourceStatus.PhysicalHost, "/")
physicalHostIDChunks.WithLabelValues(instanceId).Set(float64(getTokenCount(tokens)))
if instance.ResourceStatus.PhysicalHostTopology.Cluster == nil ||
instance.ResourceStatus.PhysicalHostTopology.Block == nil ||
instance.ResourceStatus.PhysicalHostTopology.Subblock == nil {
klog.InfoS("Missing topology info", "instance", instanceId)
missingTopologyInfo.WithLabelValues(instanceId).Inc()
continue
}
inst := &topology.InstanceTopology{
InstanceID: instanceId,
SpineID: tokens[1],
BlockID: tokens[2],
InstanceID: instanceId,
DatacenterID: instance.ResourceStatus.PhysicalHostTopology.GetCluster(),
SpineID: instance.ResourceStatus.PhysicalHostTopology.GetBlock(),
BlockID: instance.ResourceStatus.PhysicalHostTopology.GetSubblock(),
}
inst.AcceleratorID = inst.BlockID
klog.Infof("Adding topology: %s", inst.String())
topo.Append(inst)
}
Expand All @@ -112,16 +117,6 @@ func (p *baseProvider) generateRegionInstanceTopology(ctx context.Context, clien
}
}

func getTokenCount(tokens []string) int {
c := 0
for _, q := range tokens {
if len(q) > 0 {
c += 1
}
}
return c
}

func castPageSize(val *int) *uint32 {
if val == nil {
return nil
Expand Down
37 changes: 0 additions & 37 deletions pkg/providers/gcp/instance_topology_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,12 @@
package gcp

import (
"strings"
"testing"

"github.com/agrea/ptr"
"github.com/stretchr/testify/require"
)

func TestGetTokenCount(t *testing.T) {
testCases := []struct {
name string
physicaHostID string
count int
}{
{
name: "Regular Test case",
physicaHostID: "/AA/BB/CC",
count: 3,
},
{
name: "Rack ID missing in physical host ID",
physicaHostID: "/AA//CC",
count: 2,
},
{
name: "No cluster ID, rack ID or host ID",
physicaHostID: "///",
count: 0,
},
{
name: "Empty physical host ID",
physicaHostID: "",
count: 0,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tokens := strings.Split(tc.physicaHostID, "/")
count := getTokenCount(tokens)
require.Equal(t, tc.count, count)
})
}
}

func TestCastPageSize(t *testing.T) {
testCases := []struct {
name string
Expand Down
36 changes: 18 additions & 18 deletions pkg/providers/gcp/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,34 +21,34 @@ import (
)

var (
physicalHostIDChunks = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "physical_host_id_chunks",
Subsystem: "topogen_gcp",
Help: "Number of chunks in physical host ID as in /AA/BB/CCC",
missingTopologyInfo = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "missing_topology_info",
Subsystem: "topograph_gcp",
Help: "Number of times instance topology not found",
}, []string{"instance_name"},
)

resourceStatusNotFound = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "resource_status_not_found",
Subsystem: "topogen_gcp",
missingResourceStatus = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "missing_resource_status",
Subsystem: "topograph_gcp",
Help: "Number of times resource status not found",
}, []string{"instance_name"},
)

physicalHostNotFound = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "physical_host_not_found",
Subsystem: "topogen_gcp",
Help: "Number of times physical host not found",
missingPhysicalHostTopology = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "missing_physical_host_topology",
Subsystem: "topograph_gcp",
Help: "Number of times physical host topology not found",
}, []string{"instance_name"},
)

requestLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "request_latency",
Subsystem: "topogen_gcp",
Subsystem: "topograph_gcp",
Help: "Latency of requests",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
Expand All @@ -57,8 +57,8 @@ var (
)

func init() {
prometheus.MustRegister(physicalHostIDChunks)
prometheus.MustRegister(resourceStatusNotFound)
prometheus.MustRegister(physicalHostNotFound)
prometheus.MustRegister(missingTopologyInfo)
prometheus.MustRegister(missingResourceStatus)
prometheus.MustRegister(missingPhysicalHostTopology)
prometheus.MustRegister(requestLatency)
}
20 changes: 16 additions & 4 deletions pkg/providers/gcp/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ package gcp
import (
"context"
"fmt"
"strings"
"time"

compute_v1 "cloud.google.com/go/compute/apiv1"
computepb "cloud.google.com/go/compute/apiv1/computepb"
compute_v2 "cloud.google.com/go/compute/apiv2alpha"
computepb "cloud.google.com/go/compute/apiv2alpha/computepb"
"cloud.google.com/go/compute/metadata"
gax "github.com/googleapis/gax-go/v2"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -50,7 +51,7 @@ type Client interface {
}

type gcpClient struct {
instanceClient *compute_v1.InstancesClient
instanceClient *compute_v2.InstancesClient
pageSize *uint32
}

Expand All @@ -75,7 +76,7 @@ func NamedLoader() (string, providers.Loader) {

func Loader(ctx context.Context, config providers.Config) (providers.Provider, error) {
clientFactory := func(pageSize *int) (Client, error) {
instanceClient, err := compute_v1.NewInstancesRESTClient(ctx)
instanceClient, err := compute_v2.NewInstancesRESTClient(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get instances client: %s", err.Error())
}
Expand All @@ -90,6 +91,17 @@ func Loader(ctx context.Context, config providers.Config) (providers.Provider, e
}

func (p *baseProvider) GenerateTopologyConfig(ctx context.Context, pageSize *int, instances []topology.ComputeInstances) (*topology.Vertex, error) {
// TODO: remove this work-around
for i, ci := range instances {
for inst, host := range ci.Instances {
if indx := strings.LastIndex(inst, "/"); indx != -1 {
delete(instances[i].Instances, inst)
id := inst[indx+1:]
instances[i].Instances[id] = host
}
}
}

topo, err := p.generateInstanceTopology(ctx, pageSize, instances)
if err != nil {
return nil, err
Expand Down
9 changes: 6 additions & 3 deletions pkg/providers/gcp/provider_sim.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"fmt"
"strconv"

computepb "cloud.google.com/go/compute/apiv1/computepb"
"cloud.google.com/go/compute/apiv2alpha/computepb"
"github.com/agrea/ptr"
gax "github.com/googleapis/gax-go/v2"
"google.golang.org/api/iterator"
Expand Down Expand Up @@ -90,7 +90,6 @@ func (c *simClient) Instances(ctx context.Context, req *computepb.ListInstancesR

for indx = from; indx < from+int(*c.pageSize); indx++ {
node := c.model.Nodes[c.instanceIDs[indx]]
physicalHost := fmt.Sprintf("/%s/%s/%s", node.NetLayers[1], node.NetLayers[0], node.Name)
instanceID, err := strconv.ParseUint(node.Name, 10, 64)
if err != nil {
return &simInstanceIter{err: fmt.Errorf("invalid instance ID %q; must be numerical", node.Name)}, ""
Expand All @@ -99,7 +98,11 @@ func (c *simClient) Instances(ctx context.Context, req *computepb.ListInstancesR
Id: &instanceID,
Name: &node.Name,
ResourceStatus: &computepb.ResourceStatus{
PhysicalHost: &physicalHost,
PhysicalHostTopology: &computepb.ResourceStatusPhysicalHostTopology{
Cluster: &node.NetLayers[2],
Block: &node.NetLayers[1],
Subblock: &node.NetLayers[0],
},
},
}
iter.instances = append(iter.instances, instance)
Expand Down
Loading
Loading