plugins/outputs/stackdriver/stackdriver.go

//go:generate ../../../tools/readme_config_includer/generator
package stackdriver

import (
	"context"
	_ "embed"
	"errors"
	"fmt"
	"hash/fnv"
	"path"
	"sort"
	"strconv"
	"strings"

	monitoring "cloud.google.com/go/monitoring/apiv3/v2"
	"cloud.google.com/go/monitoring/apiv3/v2/monitoringpb"
	"google.golang.org/api/option"
	"google.golang.org/genproto/googleapis/api/distribution"
	metricpb "google.golang.org/genproto/googleapis/api/metric"
	monitoredrespb "google.golang.org/genproto/googleapis/api/monitoredres"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"

	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/filter"
	"github.com/influxdata/telegraf/internal"
	"github.com/influxdata/telegraf/plugins/outputs"
)

//go:embed sample.conf
var sampleConfig string

// Stackdriver is the Google Stackdriver config info.
type Stackdriver struct {
	Project              string            `toml:"project"`
	Namespace            string            `toml:"namespace"`
	ResourceType         string            `toml:"resource_type"`
	ResourceLabels       map[string]string `toml:"resource_labels"`
	MetricTypePrefix     string            `toml:"metric_type_prefix"`
	MetricNameFormat     string            `toml:"metric_name_format"`
	MetricDataType       string            `toml:"metric_data_type"`
	TagsAsResourceLabels []string          `toml:"tags_as_resource_label"`
	MetricCounter        []string          `toml:"metric_counter"`
	MetricGauge          []string          `toml:"metric_gauge"`
	MetricHistogram      []string          `toml:"metric_histogram"`
	Log                  telegraf.Logger   `toml:"-"`

	client          *monitoring.MetricClient
	counterCache    *counterCache
	filterCounter   filter.Filter
	filterGauge     filter.Filter
	filterHistogram filter.Filter
}

const (
	// The user-defined limits are documented below:
	// https://cloud.google.com/monitoring/quotas#custom_metrics_quotas

	// QuotaLabelsPerMetricDescriptor is the limit
	// to labels (tags) per metric descriptor.
	QuotaLabelsPerMetricDescriptor = 30
	// QuotaStringLengthForLabelKey is the limit
	// to string length for label key.
	QuotaStringLengthForLabelKey = 100
	// QuotaStringLengthForLabelValue is the limit
	// to string length for label value.
	QuotaStringLengthForLabelValue = 1024

	// MaxInt is the max int64 value.
	MaxInt = int(^uint(0) >> 1)
)

func (s *Stackdriver) Init() error {
	if s.MetricTypePrefix == "" {
		s.MetricTypePrefix = "custom.googleapis.com"
	}

	switch s.MetricNameFormat {
	case "":
		s.MetricNameFormat = "path"
	case "path", "official":
	default:
		return fmt.Errorf("unrecognized metric name format: %s", s.MetricNameFormat)
	}

	switch s.MetricDataType {
	case "":
		s.MetricDataType = "source"
	case "source", "double":
	default:
		return fmt.Errorf("unrecognized metric data type: %s", s.MetricDataType)
	}

	var err error
	s.filterCounter, err = filter.Compile(s.MetricCounter)
	if err != nil {
		return fmt.Errorf("creating counter filter failed: %w", err)
	}
	s.filterGauge, err = filter.Compile(s.MetricGauge)
	if err != nil {
		return fmt.Errorf("creating gauge filter failed: %w", err)
	}
	s.filterHistogram, err = filter.Compile(s.MetricHistogram)
	if err != nil {
		return fmt.Errorf("creating histogram filter failed: %w", err)
	}

	return nil
}

func (*Stackdriver) SampleConfig() string {
	return sampleConfig
}

// Connect initiates the primary connection to the GCP project.
func (s *Stackdriver) Connect() error {
	if s.Project == "" {
		return errors.New("project is a required field for stackdriver output")
	}

	if s.Namespace == "" {
		s.Log.Warn("plugin-level namespace is empty")
	}

	if s.ResourceType == "" {
		s.ResourceType = "global"
	}

	if s.ResourceLabels == nil {
		s.ResourceLabels = make(map[string]string, 1)
	}

	if s.counterCache == nil {
		s.counterCache = NewCounterCache(s.Log)
	}

	s.ResourceLabels["project_id"] = s.Project

	if s.client == nil {
		ctx := context.Background()
		client, err := monitoring.NewMetricClient(ctx, option.WithUserAgent(internal.ProductToken()))
		if err != nil {
			return err
		}
		s.client = client
	}

	return nil
}

// Sorted returns a copy of the metrics in time ascending order.  A copy is
// made to avoid modifying the input metric slice since doing so is not
// allowed.
func sorted(metrics []telegraf.Metric) []telegraf.Metric {
	batch := make([]telegraf.Metric, 0, len(metrics))
	for i := len(metrics) - 1; i >= 0; i-- {
		batch = append(batch, metrics[i])
	}
	sort.Slice(batch, func(i, j int) bool {
		return batch[i].Time().Before(batch[j].Time())
	})
	return batch
}

type timeSeriesBuckets map[uint64][]*monitoringpb.TimeSeries

func (tsb timeSeriesBuckets) Add(m telegraf.Metric, f []*telegraf.Field, ts *monitoringpb.TimeSeries) {
	h := fnv.New64a()
	h.Write([]byte(m.Name()))
	h.Write([]byte{'\n'})
	for _, field := range f {
		h.Write([]byte(field.Key))
		h.Write([]byte{'\n'})
	}
	for key, value := range m.Tags() {
		h.Write([]byte(key))
		h.Write([]byte{'\n'})
		h.Write([]byte(value))
		h.Write([]byte{'\n'})
	}
	k := h.Sum64()

	s := tsb[k]
	s = append(s, ts)
	tsb[k] = s
}

// Split metrics up by timestamp and send to Google Cloud Stackdriver
func (s *Stackdriver) Write(metrics []telegraf.Metric) error {
	metricBatch := make(map[int64][]telegraf.Metric)
	timestamps := make([]int64, 0, len(metrics))
	for _, metric := range sorted(metrics) {
		timestamp := metric.Time().UnixNano()
		if existingSlice, ok := metricBatch[timestamp]; ok {
			metricBatch[timestamp] = append(existingSlice, metric)
		} else {
			metricBatch[timestamp] = []telegraf.Metric{metric}
			timestamps = append(timestamps, timestamp)
		}
	}

	// sort the timestamps we collected
	sort.Slice(timestamps, func(i, j int) bool { return timestamps[i] < timestamps[j] })

	s.Log.Debugf("received %d metrics\n", len(metrics))
	s.Log.Debugf("split into %d groups by timestamp\n", len(metricBatch))
	for _, timestamp := range timestamps {
		if err := s.sendBatch(metricBatch[timestamp]); err != nil {
			return err
		}
	}

	return nil
}

// Write the metrics to Google Cloud Stackdriver.
func (s *Stackdriver) sendBatch(batch []telegraf.Metric) error {
	ctx := context.Background()

	buckets := make(timeSeriesBuckets)
	for _, m := range batch {
		// Set metric types based on user-provided filter
		metricType := m.Type()
		if s.filterCounter != nil && s.filterCounter.Match(m.Name()) {
			metricType = telegraf.Counter
		}
		if s.filterGauge != nil && s.filterGauge.Match(m.Name()) {
			metricType = telegraf.Gauge
		}
		if s.filterHistogram != nil && s.filterHistogram.Match(m.Name()) {
			metricType = telegraf.Histogram
		}

		metricKind, err := getStackdriverMetricKind(metricType)
		if err != nil {
			s.Log.Errorf("Get kind for metric %q (%T) failed: %s", m.Name(), metricType, err)
			continue
		}

		// Convert any declared tag to a resource label and remove it from
		// the metric
		resourceLabels := make(map[string]string, len(s.ResourceLabels)+len(s.TagsAsResourceLabels))
		for k, v := range s.ResourceLabels {
			resourceLabels[k] = v
		}
		for _, tag := range s.TagsAsResourceLabels {
			if val, ok := m.GetTag(tag); ok {
				resourceLabels[tag] = val
				m.RemoveTag(tag)
			}
		}

		if m.Type() == telegraf.Histogram {
			value, err := s.buildHistogram(m)
			if err != nil {
				s.Log.Errorf("Unable to build distribution from metric %s: %s", m, err)
				continue
			}

			startTime, endTime := getStackdriverIntervalEndpoints(metricKind, value, m, nil, s.counterCache)
			timeInterval, err := getStackdriverTimeInterval(metricKind, startTime, endTime)
			if err != nil {
				s.Log.Errorf("Get time interval failed: %s", err)
				continue
			}

			// Prepare an individual data point.
			dataPoint := &monitoringpb.Point{
				Interval: timeInterval,
				Value:    value,
			}

			// Prepare time series.
			timeSeries := &monitoringpb.TimeSeries{
				Metric: &metricpb.Metric{
					Type:   s.generateHistogramName(m),
					Labels: s.getStackdriverLabels(m.TagList()),
				},
				MetricKind: metricKind,
				Resource: &monitoredrespb.MonitoredResource{
					Type:   s.ResourceType,
					Labels: resourceLabels,
				},
				Points: []*monitoringpb.Point{
					dataPoint,
				},
			}

			buckets.Add(m, m.FieldList(), timeSeries)
			continue
		}

		for _, f := range m.FieldList() {
			value, err := s.getStackdriverTypedValue(f.Value)
			if err != nil {
				s.Log.Errorf("Get type failed: %q", err)
				continue
			}
			if value == nil {
				continue
			}

			startTime, endTime := getStackdriverIntervalEndpoints(metricKind, value, m, f, s.counterCache)
			timeInterval, err := getStackdriverTimeInterval(metricKind, startTime, endTime)
			if err != nil {
				s.Log.Errorf("Get time interval failed: %s", err)
				continue
			}

			// Prepare an individual data point.
			dataPoint := &monitoringpb.Point{
				Interval: timeInterval,
				Value:    value,
			}

			// Prepare time series.
			timeSeries := &monitoringpb.TimeSeries{
				Metric: &metricpb.Metric{
					Type:   s.generateMetricName(m, metricType, f.Key),
					Labels: s.getStackdriverLabels(m.TagList()),
				},
				MetricKind: metricKind,
				Resource: &monitoredrespb.MonitoredResource{
					Type:   s.ResourceType,
					Labels: resourceLabels,
				},
				Points: []*monitoringpb.Point{
					dataPoint,
				},
			}

			buckets.Add(m, []*telegraf.Field{f}, timeSeries)

			// If the metric is untyped, it will end with unknown. We will also
			// send another metric with the unknown:counter suffix. Google will
			// do some heuristics to know which one to use for queries. This
			// only occurs when using the official name format.
			if s.MetricNameFormat == "official" && strings.HasSuffix(timeSeries.Metric.Type, "unknown") {
				metricKind := metricpb.MetricDescriptor_CUMULATIVE
				startTime, endTime := getStackdriverIntervalEndpoints(metricKind, value, m, f, s.counterCache)
				timeInterval, err := getStackdriverTimeInterval(metricKind, startTime, endTime)
				if err != nil {
					s.Log.Errorf("Get time interval failed: %s", err)
					continue
				}
				dataPoint := &monitoringpb.Point{
					Interval: timeInterval,
					Value:    value,
				}

				counterTimeSeries := &monitoringpb.TimeSeries{
					Metric: &metricpb.Metric{
						Type:   s.generateMetricName(m, metricType, f.Key) + ":counter",
						Labels: s.getStackdriverLabels(m.TagList()),
					},
					MetricKind: metricpb.MetricDescriptor_CUMULATIVE,
					Resource: &monitoredrespb.MonitoredResource{
						Type:   s.ResourceType,
						Labels: resourceLabels,
					},
					Points: []*monitoringpb.Point{
						dataPoint,
					},
				}
				buckets.Add(m, []*telegraf.Field{f}, counterTimeSeries)
			}
		}
	}

	// process the buckets in order
	keys := make([]uint64, 0, len(buckets))
	for k := range buckets {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })

	for len(buckets) != 0 {
		// can send up to 200 time series to stackdriver
		timeSeries := make([]*monitoringpb.TimeSeries, 0, 200)
		for i := 0; i < len(keys) && len(timeSeries) < cap(timeSeries); i++ {
			k := keys[i]
			s := buckets[k]
			timeSeries = append(timeSeries, s[0])
			if len(s) == 1 {
				delete(buckets, k)
				keys = append(keys[:i], keys[i+1:]...)
				i--
				continue
			}

			s = s[1:]
			buckets[k] = s
		}

		// Prepare time series request.
		timeSeriesRequest := &monitoringpb.CreateTimeSeriesRequest{
			Name:       "projects/" + s.Project,
			TimeSeries: timeSeries,
		}

		// Create the time series in Stackdriver.
		err := s.client.CreateTimeSeries(ctx, timeSeriesRequest)
		if err != nil {
			if errStatus, ok := status.FromError(err); ok {
				if errStatus.Code().String() == "InvalidArgument" {
					s.Log.Warnf("Unable to write to Stackdriver - dropping metrics: %s", err)
					return nil
				}
			}

			s.Log.Errorf("Unable to write to Stackdriver: %s", err)
			return err
		}
	}

	return nil
}

func (s *Stackdriver) generateMetricName(m telegraf.Metric, metricType telegraf.ValueType, key string) string {
	if s.MetricNameFormat == "path" {
		return path.Join(s.MetricTypePrefix, s.Namespace, m.Name(), key)
	}

	name := m.Name() + "_" + key
	if s.Namespace != "" {
		name = s.Namespace + "_" + m.Name() + "_" + key
	}

	var kind string
	switch metricType {
	case telegraf.Gauge:
		kind = "gauge"
	case telegraf.Untyped:
		kind = "unknown"
	case telegraf.Counter:
		kind = "counter"
	case telegraf.Histogram:
		kind = "histogram"
	default:
		kind = ""
	}

	return path.Join(s.MetricTypePrefix, name, kind)
}

func (s *Stackdriver) generateHistogramName(m telegraf.Metric) string {
	if s.MetricNameFormat == "path" {
		return path.Join(s.MetricTypePrefix, s.Namespace, m.Name())
	}

	name := m.Name()
	if s.Namespace != "" {
		name = s.Namespace + "_" + m.Name()
	}

	return path.Join(s.MetricTypePrefix, name, "histogram")
}

func getStackdriverIntervalEndpoints(
	kind metricpb.MetricDescriptor_MetricKind,
	value *monitoringpb.TypedValue,
	m telegraf.Metric,
	f *telegraf.Field,
	cc *counterCache,
) (*timestamppb.Timestamp, *timestamppb.Timestamp) {
	endTime := timestamppb.New(m.Time())
	var startTime *timestamppb.Timestamp
	if kind == metricpb.MetricDescriptor_CUMULATIVE {
		// Interval starts for stackdriver CUMULATIVE metrics must reset any time
		// the counter resets, so we keep a cache of the start times and last
		// observed values for each counter in the batch.
		startTime = cc.GetStartTime(GetCounterCacheKey(m, f), value, endTime)
	}
	return startTime, endTime
}

func getStackdriverTimeInterval(m metricpb.MetricDescriptor_MetricKind, startTime, endTime *timestamppb.Timestamp) (*monitoringpb.TimeInterval, error) {
	switch m {
	case metricpb.MetricDescriptor_GAUGE:
		return &monitoringpb.TimeInterval{
			EndTime: endTime,
		}, nil
	case metricpb.MetricDescriptor_CUMULATIVE:
		return &monitoringpb.TimeInterval{
			StartTime: startTime,
			EndTime:   endTime,
		}, nil
	case metricpb.MetricDescriptor_DELTA, metricpb.MetricDescriptor_METRIC_KIND_UNSPECIFIED:
		fallthrough
	default:
		return nil, fmt.Errorf("unsupported metric kind %T", m)
	}
}

func getStackdriverMetricKind(vt telegraf.ValueType) (metricpb.MetricDescriptor_MetricKind, error) {
	switch vt {
	case telegraf.Untyped:
		return metricpb.MetricDescriptor_GAUGE, nil
	case telegraf.Gauge:
		return metricpb.MetricDescriptor_GAUGE, nil
	case telegraf.Counter:
		return metricpb.MetricDescriptor_CUMULATIVE, nil
	case telegraf.Histogram:
		return metricpb.MetricDescriptor_CUMULATIVE, nil
	case telegraf.Summary:
		fallthrough
	default:
		return metricpb.MetricDescriptor_METRIC_KIND_UNSPECIFIED, fmt.Errorf("unsupported telegraf value type: %T", vt)
	}
}

func (s *Stackdriver) getStackdriverTypedValue(value interface{}) (*monitoringpb.TypedValue, error) {
	if s.MetricDataType == "double" {
		v, err := internal.ToFloat64(value)
		if err != nil {
			return nil, err
		}

		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_DoubleValue{
				DoubleValue: v,
			},
		}, nil
	}

	switch v := value.(type) {
	case uint64:
		if v <= uint64(MaxInt) {
			return &monitoringpb.TypedValue{
				Value: &monitoringpb.TypedValue_Int64Value{
					Int64Value: int64(v),
				},
			}, nil
		}
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_Int64Value{
				Int64Value: int64(MaxInt),
			},
		}, nil
	case int64:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_Int64Value{
				Int64Value: v,
			},
		}, nil
	case float64:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_DoubleValue{
				DoubleValue: v,
			},
		}, nil
	case bool:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_BoolValue{
				BoolValue: v,
			},
		}, nil
	case string:
		// String value types are not available for custom metrics
		return nil, nil
	default:
		return nil, fmt.Errorf("value type \"%T\" not supported for stackdriver custom metrics", v)
	}
}

func (s *Stackdriver) buildHistogram(m telegraf.Metric) (*monitoringpb.TypedValue, error) {
	sumInter, ok := m.GetField("sum")
	if !ok {
		return nil, errors.New("no sum field present")
	}
	sum, err := internal.ToFloat64(sumInter)
	if err != nil {
		return nil, fmt.Errorf("unable to convert sum value to float64: %w", err)
	}
	m.RemoveField("sum")

	countInter, ok := m.GetField("count")
	if !ok {
		return nil, errors.New("no count field present")
	}
	count, err := internal.ToFloat64(countInter)
	if err != nil {
		return nil, fmt.Errorf("unable to convert count value to float64: %w", err)
	}
	m.RemoveField("count")

	// Build map of the buckets and their values
	buckets := make([]float64, 0)
	bucketCounts := make([]int64, 0)
	for _, field := range m.FieldList() {
		// Add the +inf value to bucket counts, no need to define a bound
		if strings.Contains(strings.ToLower(field.Key), "+inf") {
			count, err := internal.ToInt64(field.Value)
			if err != nil {
				continue
			}
			bucketCounts = append(bucketCounts, count)
			continue
		}

		bucket, err := strconv.ParseFloat(field.Key, 64)
		if err != nil {
			continue
		}

		count, err := internal.ToInt64(field.Value)
		if err != nil {
			continue
		}

		buckets = append(buckets, bucket)
		bucketCounts = append(bucketCounts, count)
	}

	sort.Slice(buckets, func(i, j int) bool {
		return buckets[i] < buckets[j]
	})
	sort.Slice(bucketCounts, func(i, j int) bool {
		return bucketCounts[i] < bucketCounts[j]
	})

	// Bucket counts contain the count for a specific bucket, not the running
	// total like Prometheus histograms use. Loop backwards to determine the
	// count of each bucket rather than the running total count.
	for i := len(bucketCounts) - 1; i > 0; i-- {
		bucketCounts[i] = bucketCounts[i] - bucketCounts[i-1]
	}

	v := &monitoringpb.TypedValue{
		Value: &monitoringpb.TypedValue_DistributionValue{
			DistributionValue: &distribution.Distribution{
				Count:        int64(count),
				Mean:         sum / count,
				BucketCounts: bucketCounts,
				BucketOptions: &distribution.Distribution_BucketOptions{
					Options: &distribution.Distribution_BucketOptions_ExplicitBuckets{
						ExplicitBuckets: &distribution.Distribution_BucketOptions_Explicit{
							Bounds: buckets,
						},
					},
				},
			},
		},
	}

	return v, nil
}

func (s *Stackdriver) getStackdriverLabels(tags []*telegraf.Tag) map[string]string {
	labels := make(map[string]string)
	for _, t := range tags {
		labels[t.Key] = t.Value
	}
	for k, v := range labels {
		if len(k) > QuotaStringLengthForLabelKey {
			s.Log.Warnf("Removing tag %q key exceeds string length for label key [%d]", k, QuotaStringLengthForLabelKey)
			delete(labels, k)
			continue
		}
		if len(v) > QuotaStringLengthForLabelValue {
			s.Log.Warnf("Removing tag %q value exceeds string length for label value [%d]", k, QuotaStringLengthForLabelValue)
			delete(labels, k)
			continue
		}
	}
	if len(labels) > QuotaLabelsPerMetricDescriptor {
		excess := len(labels) - QuotaLabelsPerMetricDescriptor
		s.Log.Warnf("Tag count [%d] exceeds quota for stackdriver labels [%d] removing [%d] random tags", len(labels), QuotaLabelsPerMetricDescriptor, excess)
		for k := range labels {
			if excess == 0 {
				break
			}
			excess--
			delete(labels, k)
		}
	}

	return labels
}

// Close will terminate the session to the backend, returning error if an issue arises.
func (s *Stackdriver) Close() error {
	return s.client.Close()
}

func newStackdriver() *Stackdriver {
	return &Stackdriver{}
}

func init() {
	outputs.Add("stackdriver", func() telegraf.Output {
		return newStackdriver()
	})
}