From 2b01ff787677612482e5b8556d269dd8991838fa Mon Sep 17 00:00:00 2001 From: Bryan Fang Date: Fri, 15 Mar 2024 21:08:59 +0800 Subject: [PATCH] support multiple aws accounts with credentials Signed-off-by: Bryan Fang --- cmd/aws_credentials.go | 11 + cmd/helper.go | 34 + cmd/root.go | 84 ++- .../prometheus-rds-exporter.yaml | 7 + internal/app/exporter/exporter.go | 611 +++++++++++++++++- 5 files changed, 697 insertions(+), 50 deletions(-) create mode 100644 cmd/aws_credentials.go diff --git a/cmd/aws_credentials.go b/cmd/aws_credentials.go new file mode 100644 index 0000000..8b75011 --- /dev/null +++ b/cmd/aws_credentials.go @@ -0,0 +1,11 @@ +package cmd + +type Account struct { + AwsAccessKeyID string `mapstructure:"aws_access_key_id"` + AwsSecretAccessKey string `mapstructure:"aws_secret_access_key"` + Regions []string `yaml:"regions"` +} + +type AWSCredentials struct { + Accounts []Account `yaml:"accounts"` +} diff --git a/cmd/helper.go b/cmd/helper.go index 9b08a35..d150687 100644 --- a/cmd/helper.go +++ b/cmd/helper.go @@ -4,9 +4,11 @@ import ( "context" "fmt" "log/slog" + "reflect" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/credentials/stscreds" "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" "github.com/aws/aws-sdk-go-v2/service/sts" @@ -54,3 +56,35 @@ func getAWSSessionInformation(cfg aws.Config) (string, string, error) { return *output.Account, cfg.Region, nil } + +func getAWSConfigurationByCredentials(logger *slog.Logger, configuration exporterConfig) ([]aws.Config, error) { + var configs []aws.Config + accountsFromYaml := configuration.AwsCredentials + if reflect.ValueOf(accountsFromYaml).IsZero() { + logger.Error("AWS accounts not configured in yaml") + return nil, nil + } else { + accounts := accountsFromYaml.Accounts + for _, c := range accounts { + aws_access_key_id := c.AwsAccessKeyID + aws_secret_access_key := c.AwsSecretAccessKey + 
staticProvider := credentials.NewStaticCredentialsProvider( + aws_access_key_id, + aws_secret_access_key, + "", + ) + cfg, err := config.LoadDefaultConfig( + context.Background(), + config.WithCredentialsProvider(staticProvider), + ) + if err != nil { + return nil, err + } + for _, region := range c.Regions { + cfg.Region = region + configs = append(configs, cfg) + } + } + } + return configs, nil +} diff --git a/cmd/root.go b/cmd/root.go index 241006c..6e80631 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -28,6 +28,7 @@ const ( ) var cfgFile string +var actRegionClient []exporter.AccountRegionClients type exporterConfig struct { Debug bool `mapstructure:"debug"` @@ -46,6 +47,7 @@ type exporterConfig struct { CollectQuotas bool `mapstructure:"collect-quotas"` CollectUsages bool `mapstructure:"collect-usages"` OTELTracesEnabled bool `mapstructure:"enable-otel-traces"` + AwsCredentials AWSCredentials } func run(configuration exporterConfig) { @@ -54,24 +56,6 @@ func run(configuration exporterConfig) { fmt.Println("ERROR: Fail to initialize logger: %w", err) panic(err) } - - cfg, err := getAWSConfiguration(logger, configuration.AWSAssumeRoleArn, configuration.AWSAssumeRoleSession) - if err != nil { - logger.Error("can't initialize AWS configuration", "reason", err) - os.Exit(awsErrorExitCode) - } - - awsAccountID, awsRegion, err := getAWSSessionInformation(cfg) - if err != nil { - logger.Error("can't identify AWS account and/or region", "reason", err) - os.Exit(awsErrorExitCode) - } - - rdsClient := rds.NewFromConfig(cfg) - ec2Client := ec2.NewFromConfig(cfg) - cloudWatchClient := cloudwatch.NewFromConfig(cfg) - servicequotasClient := servicequotas.NewFromConfig(cfg) - collectorConfiguration := exporter.Configuration{ CollectInstanceMetrics: configuration.CollectInstanceMetrics, CollectInstanceTypes: configuration.CollectInstanceTypes, @@ -82,16 +66,65 @@ func run(configuration exporterConfig) { CollectUsages: configuration.CollectUsages, } - collector := 
exporter.NewCollector(*logger, collectorConfiguration, awsAccountID, awsRegion, rdsClient, ec2Client, cloudWatchClient, servicequotasClient) + cfgs, err := getAWSConfigurationByCredentials(logger, configuration) + if err != nil { + logger.Error("can't initialize AWS configuration", "reason", err) + os.Exit(awsErrorExitCode) + } + if cfgs == nil { + logger.Info("Didn't configure aws IAM User credentials in configuration file, will use default aws configuration") + cfg, err := getAWSConfiguration(logger, configuration.AWSAssumeRoleArn, configuration.AWSAssumeRoleSession) + if err != nil { + logger.Error("can't initialize AWS configuration", "reason", err) + os.Exit(awsErrorExitCode) + } + awsAccountID, awsRegion, err := getAWSSessionInformation(cfg) + if err != nil { + logger.Error("can't identify AWS account and/or region", "reason", err) + os.Exit(awsErrorExitCode) + } + + rdsClient := rds.NewFromConfig(cfg) + ec2Client := ec2.NewFromConfig(cfg) + cloudWatchClient := cloudwatch.NewFromConfig(cfg) + servicequotasClient := servicequotas.NewFromConfig(cfg) + + collector := exporter.NewCollector(*logger, collectorConfiguration, awsAccountID, awsRegion, rdsClient, ec2Client, cloudWatchClient, servicequotasClient) + + prometheus.MustRegister(collector) - prometheus.MustRegister(collector) + } else { + for _, cfg := range cfgs { + awsAccountID, awsRegion, err := getAWSSessionInformation(cfg) + if err != nil { + logger.Error("can't identify AWS account and/or region", "reason", err) + os.Exit(awsErrorExitCode) + } + + rdsClient := rds.NewFromConfig(cfg) + ec2Client := ec2.NewFromConfig(cfg) + cloudWatchClient := cloudwatch.NewFromConfig(cfg) + servicequotasClient := servicequotas.NewFromConfig(cfg) + + var accountRegionClients exporter.AccountRegionClients + accountRegionClients.AwsAccountID = awsAccountID + accountRegionClients.AwsRegion = awsRegion + accountRegionClients.RdsClient = rdsClient + accountRegionClients.Ec2Client = ec2Client + 
accountRegionClients.CloudWatchClient = cloudWatchClient + accountRegionClients.ServicequotasClient = servicequotasClient + actRegionClient = append(actRegionClient, accountRegionClients) + } + collector := exporter.NewMultiCollector(*logger, collectorConfiguration, actRegionClient) + prometheus.MustRegister(collector) + } + // http configurations for exporter service serverConfiguration := http.Config{ - ListenAddress: configuration.ListenAddress, - MetricPath: configuration.MetricPath, - TLSCertPath: configuration.TLSCertPath, - TLSKeyPath: configuration.TLSKeyPath, - OTELTracesEnabled: configuration.OTELTracesEnabled, + ListenAddress: configuration.ListenAddress, + MetricPath: configuration.MetricPath, + TLSCertPath: configuration.TLSCertPath, + TLSKeyPath: configuration.TLSKeyPath, } server := http.New(*logger, serverConfiguration) @@ -118,6 +151,7 @@ func NewRootCommand() (*cobra.Command, error) { return } + viper.UnmarshalKey("accounts", &c.AwsCredentials.Accounts) run(c) }, } diff --git a/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml b/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml index a3fcd71..aaa7037 100644 --- a/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml +++ b/configs/prometheus-rds-exporter/prometheus-rds-exporter.yaml @@ -27,6 +27,13 @@ # # AWS credentials # +# accounts: +# - aws_access_key_id: +# aws_secret_access_key: +# regions: +# - ap-northeast-1 +# - eu-central-1 +# - us-east-1 # AWS IAM ARN role to assume to fetch metrics # aws-assume-role-arn: arn:aws:iam::000000000000:role/prometheus-rds-exporter diff --git a/internal/app/exporter/exporter.go b/internal/app/exporter/exporter.go index df616c6..dc42e1a 100644 --- a/internal/app/exporter/exporter.go +++ b/internal/app/exporter/exporter.go @@ -54,6 +54,15 @@ type metrics struct { CloudWatchUsage cloudwatch.UsageMetrics } +type AccountRegionClients struct { + AwsAccountID string + AwsRegion string + RdsClient rdsClient + Ec2Client EC2Client + 
+ ServicequotasClient servicequotasClient + CloudWatchClient cloudWatchClient +} + type rdsCollector struct { ctx context.Context wg sync.WaitGroup @@ -64,6 +73,8 @@ type rdsCollector struct { awsRegion string configuration Configuration + accountRegionClients []AccountRegionClients + rdsClient rdsClient EC2Client EC2Client servicequotasClient servicequotasClient @@ -112,6 +123,226 @@ type rdsCollector struct { age *prometheus.Desc } +// Composite key for metrics Map +type AccountRegionKey struct { + AwsAccountID string + AwsRegion string +} + +type CloudWatchMetricsCounter struct { + Counters counters + CloudWatchMetrics cloudwatch.CloudWatchMetrics +} + +type ServiceQuotaMetricsCounter struct { + Counters counters + ServiceQuotaMetrics servicequotas.Metrics +} + +type EC2MetricsCounter struct { + counters counters + EC2 ec2.Metrics +} + +type CloudWatchUsageCounter struct { + counters counters + CloudWatchUsage cloudwatch.UsageMetrics +} + +/* +--------------------------------------------------------------------------------/ +Global Map for storing data per account and region +/--------------------------------------------------------------------------------- +*/ +var ServiceQuotaMetricsCounterMap map[AccountRegionKey]ServiceQuotaMetricsCounter +var CloudwatchInstancesCounterMap map[AccountRegionKey]CloudWatchMetricsCounter +var CloudWatchUsageCounterMap map[AccountRegionKey]CloudWatchUsageCounter +var EC2MetricsCounterMap map[AccountRegionKey]EC2MetricsCounter + +// Global instance metrics for RDS instances +type GlobalInstanceMetrics struct { + AwsAccountID string + AwsRegion string + RDSInstances rds.Metrics + counters counters +} + +// Define global variable to store RDS instances and metrics +var globalInstanceMetrics []GlobalInstanceMetrics + +func NewMultiCollector(logger slog.Logger, collectorConfiguration Configuration, accountRegionClients []AccountRegionClients) *rdsCollector { + return &rdsCollector{ + logger: logger, + configuration: 
collectorConfiguration, + accountRegionClients: accountRegionClients, + + exporterBuildInformation: prometheus.NewDesc("rds_exporter_build_info", + "A metric with constant '1' value labeled by version from which exporter was built", + []string{"version", "commit_sha", "build_date"}, nil, + ), + errors: prometheus.NewDesc("rds_exporter_errors_total", + "Total number of errors encountered by the exporter", + []string{}, nil, + ), + allocatedStorage: prometheus.NewDesc("rds_allocated_storage_bytes", + "Allocated storage", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + information: prometheus.NewDesc("rds_instance_info", + "RDS instance information", + []string{"aws_account_id", "aws_region", "dbidentifier", "dbi_resource_id", "instance_class", "engine", "engine_version", "storage_type", "multi_az", "deletion_protection", "role", "source_dbidentifier", "pending_modified_values", "pending_maintenance", "performance_insights_enabled", "ca_certificate_identifier", "arn"}, nil, + ), + age: prometheus.NewDesc("rds_instance_age_seconds", + "Time since instance creation", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + maxAllocatedStorage: prometheus.NewDesc("rds_max_allocated_storage_bytes", + "Upper limit in gibibytes to which Amazon RDS can automatically scale the storage of the DB instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + maxIops: prometheus.NewDesc("rds_max_disk_iops_average", + "Max IOPS for the instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + storageThroughput: prometheus.NewDesc("rds_max_storage_throughput_bytes", + "Max storage throughput", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + readThroughput: prometheus.NewDesc("rds_read_throughput_bytes", + "Average number of bytes read from disk per second", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + writeThroughput: 
prometheus.NewDesc("rds_write_throughput_bytes", + "Average number of bytes written to disk per second", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + status: prometheus.NewDesc("rds_instance_status", + fmt.Sprintf("Instance status (%d: ok, %d: can't scrap metrics)", int(exporterUpStatusCode), int(exporterDownStatusCode)), + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + logFilesSize: prometheus.NewDesc("rds_instance_log_files_size_bytes", + "Total of log files on the instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + instanceVCPU: prometheus.NewDesc("rds_instance_vcpu_average", + "Total vCPU for this instance class", + []string{"aws_account_id", "aws_region", "instance_class"}, nil, + ), + instanceMemory: prometheus.NewDesc("rds_instance_memory_bytes", + "Instance class memory", + []string{"aws_account_id", "aws_region", "instance_class"}, nil, + ), + instanceTags: prometheus.NewDesc("rds_instance_tags", + "AWS tags attached to the instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + cpuUtilisation: prometheus.NewDesc("rds_cpu_usage_percent_average", + "Instance CPU used", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + instanceMaximumThroughput: prometheus.NewDesc("rds_instance_max_throughput_bytes", + "Maximum throughput of underlying EC2 instance class", + []string{"aws_account_id", "aws_region", "instance_class"}, nil, + ), + instanceMaximumIops: prometheus.NewDesc("rds_instance_max_iops_average", + "Maximum IOPS of underlying EC2 instance class", + []string{"aws_account_id", "aws_region", "instance_class"}, nil, + ), + freeStorageSpace: prometheus.NewDesc("rds_free_storage_bytes", + "Free storage on the instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + databaseConnections: prometheus.NewDesc("rds_database_connections_average", + "The number of client network connections to the database 
instance", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + up: prometheus.NewDesc("up", + "Was the last scrape of RDS successful", + nil, nil, + ), + swapUsage: prometheus.NewDesc("rds_swap_usage_bytes", + "Amount of swap space used on the DB instance. This metric is not available for SQL Server", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + writeIOPS: prometheus.NewDesc("rds_write_iops_average", + "Average number of disk write I/O operations per second", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + readIOPS: prometheus.NewDesc("rds_read_iops_average", + "Average number of disk read I/O operations per second", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + replicaLag: prometheus.NewDesc("rds_replica_lag_seconds", + "For read replica configurations, the amount of time a read replica DB instance lags behind the source DB instance. Applies to MariaDB, Microsoft SQL Server, MySQL, Oracle, and PostgreSQL read replicas", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + replicationSlotDiskUsage: prometheus.NewDesc("rds_replication_slot_disk_usage_bytes", + "Disk space used by replication slot files. Applies to PostgreSQL", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + maximumUsedTransactionIDs: prometheus.NewDesc("rds_maximum_used_transaction_ids_average", + "Maximum transaction IDs that have been used. Applies to only PostgreSQL", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + freeableMemory: prometheus.NewDesc("rds_freeable_memory_bytes", + "Amount of available random access memory. 
For MariaDB, MySQL, Oracle, and PostgreSQL DB instances, this metric reports the value of the MemAvailable field of /proc/meminfo", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + apiCall: prometheus.NewDesc("rds_api_call_total", + "Number of call to AWS API", + []string{"aws_account_id", "aws_region", "api"}, nil, + ), + backupRetentionPeriod: prometheus.NewDesc("rds_backup_retention_period_seconds", + "Automatic DB snapshots retention period", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + DBLoad: prometheus.NewDesc("rds_dbload_average", + "Number of active sessions for the DB engine", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + dBLoadCPU: prometheus.NewDesc("rds_dbload_cpu_average", + "Number of active sessions where the wait event type is CPU", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + dBLoadNonCPU: prometheus.NewDesc("rds_dbload_noncpu_average", + "Number of active sessions where the wait event type is not CPU", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + transactionLogsDiskUsage: prometheus.NewDesc("rds_transaction_logs_disk_usage_bytes", + "Disk space used by transaction logs (only on PostgreSQL)", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + certificateValidTill: prometheus.NewDesc("rds_certificate_expiry_timestamp_seconds", + "Timestamp of the expiration of the Instance certificate", + []string{"aws_account_id", "aws_region", "dbidentifier"}, nil, + ), + quotaDBInstances: prometheus.NewDesc("rds_quota_max_dbinstances_average", + "Maximum number of RDS instances allowed in the AWS account", + []string{"aws_account_id", "aws_region"}, nil, + ), + quotaTotalStorage: prometheus.NewDesc("rds_quota_total_storage_bytes", + "Maximum total storage for all DB instances", + []string{"aws_account_id", "aws_region"}, nil, + ), + quotaMaxDBInstanceSnapshots: 
prometheus.NewDesc("rds_quota_maximum_db_instance_snapshots_average", + "Maximum number of manual DB instance snapshots", + []string{"aws_account_id", "aws_region"}, nil, + ), + usageAllocatedStorage: prometheus.NewDesc("rds_usage_allocated_storage_bytes", + "Total storage used by AWS RDS instances", + []string{"aws_account_id", "aws_region"}, nil, + ), + usageDBInstances: prometheus.NewDesc("rds_usage_db_instances_average", + "AWS RDS instance count", + []string{"aws_account_id", "aws_region"}, nil, + ), + usageManualSnapshots: prometheus.NewDesc("rds_usage_manual_snapshots_average", + "Manual snapshots count", + []string{"aws_account_id", "aws_region"}, nil, + ), + } +} + func NewCollector(logger slog.Logger, collectorConfiguration Configuration, awsAccountID string, awsRegion string, rdsClient rdsClient, ec2Client EC2Client, cloudWatchClient cloudWatchClient, servicequotasClient servicequotasClient) *rdsCollector { return &rdsCollector{ logger: logger, @@ -335,10 +566,98 @@ func (c *rdsCollector) Describe(ch chan<- *prometheus.Desc) { ch <- c.writeThroughput } +func initGlobalVirables() { + if len(globalInstanceMetrics) > 0 { + globalInstanceMetrics = nil + } + if len(ServiceQuotaMetricsCounterMap) > 0 { + ServiceQuotaMetricsCounterMap = make(map[AccountRegionKey]ServiceQuotaMetricsCounter, 0) + } + if len(CloudwatchInstancesCounterMap) > 0 { + CloudwatchInstancesCounterMap = make(map[AccountRegionKey]CloudWatchMetricsCounter, 0) + } + if len(CloudWatchUsageCounterMap) > 0 { + CloudWatchUsageCounterMap = make(map[AccountRegionKey]CloudWatchUsageCounter, 0) + } + if len(EC2MetricsCounterMap) > 0 { + EC2MetricsCounterMap = make(map[AccountRegionKey]EC2MetricsCounter, 0) + } +} + // getMetrics collects and return all RDS metrics func (c *rdsCollector) fetchMetrics() error { c.logger.Debug("received query") + if len(c.accountRegionClients) > 0 { + c.logger.Info("Deal with multiple aws account scenario") + initGlobalVirables() + var accRegionClients = 
c.accountRegionClients + for _, arc := range accRegionClients { + c.logger.Info("Get metrics from region " + arc.AwsRegion + " in account " + arc.AwsAccountID) + var globalInstanceMetric GlobalInstanceMetrics + c.awsAccountID = arc.AwsAccountID + c.awsRegion = arc.AwsRegion + globalInstanceMetric.AwsAccountID = arc.AwsAccountID + globalInstanceMetric.AwsRegion = arc.AwsRegion + c.logger.Debug("get service quotas metrics from " + c.awsRegion + " in " + c.awsAccountID) + // Fetch serviceQuotas metrics + if c.configuration.CollectQuotas { + go c.getQuotasMetrics(arc.ServicequotasClient) + c.wg.Add(1) + } + + c.logger.Debug("get usage metrics from " + c.awsRegion + " in " + c.awsAccountID) + // Fetch usages metrics + if c.configuration.CollectUsages { + go c.getUsagesMetrics(arc.CloudWatchClient) + c.wg.Add(1) + } + + // Fetch RDS instances metrics + c.logger.Debug("get RDS metrics from " + c.awsRegion + " in " + c.awsAccountID) + + rdsFetcher := rds.NewFetcher(c.ctx, arc.RdsClient, rds.Configuration{ + CollectLogsSize: c.configuration.CollectLogsSize, + CollectMaintenances: c.configuration.CollectMaintenances, + }) + + rdsMetrics, err := rdsFetcher.GetInstancesMetrics() + if err != nil { + return fmt.Errorf("can't fetch RDS metrics: %w", err) + } + + c.metrics.RDS = rdsMetrics + // add this metrics to global + globalInstanceMetric.RDSInstances = rdsMetrics + c.counters.RDSAPIcalls += rdsFetcher.GetStatistics().RdsAPICall + globalInstanceMetric.counters.RDSAPIcalls += rdsFetcher.GetStatistics().RdsAPICall + c.logger.Debug("RDS metrics fetched from " + c.awsRegion) + + c.logger.Debug("get EC2 metrics for instance type from " + c.awsRegion + " in " + c.awsAccountID) + // Compute uniq instances identifiers and instance types + instanceIdentifiers, instanceTypes := getUniqTypeAndIdentifiers(rdsMetrics.Instances) + + // Fetch EC2 Metrics for instance types + if c.configuration.CollectInstanceTypes && len(instanceTypes) > 0 { + go c.getEC2Metrics(arc.Ec2Client, 
instanceTypes) + c.wg.Add(1) + } + + c.logger.Debug("get CloudWatch metrics from " + c.awsRegion + " in " + c.awsAccountID) + // Fetch Cloudwatch metrics for instances + if c.configuration.CollectInstanceMetrics { + go c.getCloudwatchMetrics(arc.CloudWatchClient, instanceIdentifiers) + c.wg.Add(1) + } + + // Wait for all go routines to finish + c.wg.Wait() + + globalInstanceMetrics = append(globalInstanceMetrics, globalInstanceMetric) + } + return nil + } + // Fetch serviceQuotas metrics if c.configuration.CollectQuotas { go c.getQuotasMetrics(c.servicequotasClient) @@ -399,12 +718,28 @@ func (c *rdsCollector) getCloudwatchMetrics(client cloudwatch.CloudWatchClient, fetcher := cloudwatch.NewRDSFetcher(client, c.logger) metrics, err := fetcher.GetRDSInstanceMetrics(instanceIdentifiers) - if err != nil { - c.counters.Errors++ - } + if len(c.accountRegionClients) > 0 { + var accountRegionKey AccountRegionKey + accountRegionKey.AwsAccountID = c.awsAccountID + accountRegionKey.AwsRegion = c.awsRegion + _, ok := CloudwatchInstancesCounterMap[accountRegionKey] + if !ok { + CloudwatchInstancesCounterMap = make(map[AccountRegionKey]CloudWatchMetricsCounter, 10) + var cloudWatchMetricsCounter CloudWatchMetricsCounter + if err != nil { + cloudWatchMetricsCounter.Counters.Errors++ + } + cloudWatchMetricsCounter.CloudWatchMetrics = metrics + CloudwatchInstancesCounterMap[accountRegionKey] = cloudWatchMetricsCounter + } + } else { + if err != nil { + c.counters.Errors++ + } - c.counters.CloudwatchAPICalls += fetcher.GetStatistics().CloudWatchAPICall - c.metrics.CloudwatchInstances = metrics + c.counters.CloudwatchAPICalls += fetcher.GetStatistics().CloudWatchAPICall + c.metrics.CloudwatchInstances = metrics + } c.logger.Debug("cloudwatch metrics fetched", "metrics", metrics) } @@ -416,13 +751,31 @@ func (c *rdsCollector) getUsagesMetrics(client cloudwatch.CloudWatchClient) { fetcher := cloudwatch.NewUsageFetcher(c.ctx, client, c.logger) metrics, err := fetcher.GetUsageMetrics() 
- if err != nil { - c.counters.Errors++ - c.logger.Error(fmt.Sprintf("can't fetch usage metrics: %s", err)) - } + if len(c.accountRegionClients) > 0 { + // get usage metrics per region, the metrics will be retrieved only once per combination of account and region + var accountRegionKey AccountRegionKey + accountRegionKey.AwsAccountID = c.awsAccountID + accountRegionKey.AwsRegion = c.awsRegion + _, ok := CloudWatchUsageCounterMap[accountRegionKey] + if !ok { + CloudWatchUsageCounterMap = make(map[AccountRegionKey]CloudWatchUsageCounter, 10) + var cloudWatchUsageCounter CloudWatchUsageCounter + if err != nil { + cloudWatchUsageCounter.counters.Errors++ + c.logger.Error(fmt.Sprintf("can't fetch usage metrics: %s", err)) + } + cloudWatchUsageCounter.CloudWatchUsage = metrics + CloudWatchUsageCounterMap[accountRegionKey] = cloudWatchUsageCounter + } + } else { + if err != nil { + c.counters.Errors++ + c.logger.Error(fmt.Sprintf("can't fetch usage metrics: %s", err)) + } - c.counters.UsageAPIcalls += fetcher.GetStatistics().CloudWatchAPICall - c.metrics.CloudWatchUsage = metrics + c.counters.UsageAPIcalls += fetcher.GetStatistics().CloudWatchAPICall + c.metrics.CloudWatchUsage = metrics + } c.logger.Debug("usage metrics fetched", "metrics", metrics) } @@ -434,13 +787,33 @@ func (c *rdsCollector) getEC2Metrics(client ec2.EC2Client, instanceTypes []strin fetcher := ec2.NewFetcher(c.ctx, client) metrics, err := fetcher.GetDBInstanceTypeInformation(instanceTypes) - if err != nil { - c.counters.Errors++ - c.logger.Error(fmt.Sprintf("can't fetch EC2 metrics: %s", err)) - } + if len(c.accountRegionClients) > 0 { + // retrieve all instances and put them together to map[account,region]instances + // TO-DO + var accountRegionKey AccountRegionKey + accountRegionKey.AwsAccountID = c.awsAccountID + accountRegionKey.AwsRegion = c.awsRegion + _, ok := EC2MetricsCounterMap[accountRegionKey] + if !ok { + EC2MetricsCounterMap = make(map[AccountRegionKey]EC2MetricsCounter, 10) + var 
ec2MetricsCounter EC2MetricsCounter + if err != nil { + ec2MetricsCounter.counters.Errors++ + c.logger.Error(fmt.Sprintf("can't fetch EC2 metrics: %s", err)) + } + ec2MetricsCounter.counters.EC2APIcalls += fetcher.GetStatistics().EC2ApiCall + ec2MetricsCounter.EC2 = metrics + EC2MetricsCounterMap[accountRegionKey] = ec2MetricsCounter + } + } else { + if err != nil { + c.counters.Errors++ + c.logger.Error(fmt.Sprintf("can't fetch EC2 metrics: %s", err)) + } - c.counters.EC2APIcalls += fetcher.GetStatistics().EC2ApiCall - c.metrics.EC2 = metrics + c.counters.EC2APIcalls += fetcher.GetStatistics().EC2ApiCall + c.metrics.EC2 = metrics + } c.logger.Debug("EC2 metrics fetched", "metrics", metrics) } @@ -456,15 +829,34 @@ func (c *rdsCollector) getQuotasMetrics(client servicequotas.ServiceQuotasClient fetcher := servicequotas.NewFetcher(ctx, client) metrics, err := fetcher.GetRDSQuotas() - if err != nil { - c.counters.Errors++ - c.logger.Error(fmt.Sprintf("can't fetch service quota metrics: %s", err)) - span.SetStatus(codes.Error, "can't fetch service quota metrics") - span.RecordError(err) - } + if len(c.accountRegionClients) > 0 { + // retrieve once per account and region + var accountRegionKey AccountRegionKey + accountRegionKey.AwsAccountID = c.awsAccountID + accountRegionKey.AwsRegion = c.awsRegion + _, ok := ServiceQuotaMetricsCounterMap[accountRegionKey] + if !ok { + ServiceQuotaMetricsCounterMap = make(map[AccountRegionKey]ServiceQuotaMetricsCounter, 10) + var serviceQuotaMetricsCounter ServiceQuotaMetricsCounter + if err != nil { + serviceQuotaMetricsCounter.Counters.Errors++ + c.logger.Error(fmt.Sprintf("can't fetch service quota metrics: %s", err)) + } + serviceQuotaMetricsCounter.Counters.ServiceQuotasAPICalls += fetcher.GetStatistics().UsageAPICall + serviceQuotaMetricsCounter.ServiceQuotaMetrics = metrics + ServiceQuotaMetricsCounterMap[accountRegionKey] = serviceQuotaMetricsCounter + } + } else { + if err != nil { + c.counters.Errors++ + 
c.logger.Error(fmt.Sprintf("can't fetch service quota metrics: %s", err)) + span.SetStatus(codes.Error, "can't fetch service quota metrics") + span.RecordError(err) + } - c.counters.ServiceQuotasAPICalls += fetcher.GetStatistics().UsageAPICall - c.metrics.ServiceQuota = metrics + c.counters.ServiceQuotasAPICalls += fetcher.GetStatistics().UsageAPICall + c.metrics.ServiceQuota = metrics + } span.SetStatus(codes.Ok, "quota fetched") } @@ -515,6 +907,175 @@ func (c *rdsCollector) Collect(ch chan<- prometheus.Metric) { span.End() + // RDS metrics + if len(c.accountRegionClients) > 0 { + c.logger.Info("Building RDS metrics for configured AWS Accounts........") + for _, instanceMetric := range globalInstanceMetrics { + ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, instanceMetric.counters.RDSAPIcalls, instanceMetric.AwsAccountID, instanceMetric.AwsRegion, "rds") + var instances = instanceMetric.RDSInstances.Instances + for dbidentifier, instance := range instances { + ch <- prometheus.MustNewConstMetric( + c.allocatedStorage, + prometheus.GaugeValue, + float64(instance.AllocatedStorage), + instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier, + ) + ch <- prometheus.MustNewConstMetric( + c.information, + prometheus.GaugeValue, + 1, + instanceMetric.AwsAccountID, + instanceMetric.AwsRegion, + dbidentifier, + instance.DbiResourceID, + instance.DBInstanceClass, + instance.Engine, + instance.EngineVersion, + instance.StorageType, + strconv.FormatBool(instance.MultiAZ), + strconv.FormatBool(instance.DeletionProtection), + instance.Role, + instance.SourceDBInstanceIdentifier, + strconv.FormatBool(instance.PendingModifiedValues), + instance.PendingMaintenanceAction, + strconv.FormatBool(instance.PerformanceInsightsEnabled), + instance.CACertificateIdentifier, + instance.Arn, + ) + ch <- prometheus.MustNewConstMetric(c.maxAllocatedStorage, prometheus.GaugeValue, float64(instance.MaxAllocatedStorage), instanceMetric.AwsAccountID, 
instanceMetric.AwsRegion, dbidentifier) + ch <- prometheus.MustNewConstMetric(c.maxIops, prometheus.GaugeValue, float64(instance.MaxIops), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + ch <- prometheus.MustNewConstMetric(c.status, prometheus.GaugeValue, float64(instance.Status), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + ch <- prometheus.MustNewConstMetric(c.storageThroughput, prometheus.GaugeValue, float64(instance.StorageThroughput), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + ch <- prometheus.MustNewConstMetric(c.backupRetentionPeriod, prometheus.GaugeValue, float64(instance.BackupRetentionPeriod), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + + if c.configuration.CollectInstanceTags { + names, values := c.getInstanceTagLabels(dbidentifier, instance) + + c.instanceTags = prometheus.NewDesc("rds_instance_tags", "AWS tags attached to the instance", names, nil) + ch <- prometheus.MustNewConstMetric(c.instanceTags, prometheus.GaugeValue, 0, values...) 
+ } + + if instance.CertificateValidTill != nil { + ch <- prometheus.MustNewConstMetric(c.certificateValidTill, prometheus.GaugeValue, float64(instance.CertificateValidTill.Unix()), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + } + + if instance.Age != nil { + ch <- prometheus.MustNewConstMetric(c.age, prometheus.GaugeValue, *instance.Age, instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + } + + if instance.LogFilesSize != nil { + ch <- prometheus.MustNewConstMetric(c.logFilesSize, prometheus.GaugeValue, float64(*instance.LogFilesSize), instanceMetric.AwsAccountID, instanceMetric.AwsRegion, dbidentifier) + } + } + } + + // usage metrics + if c.configuration.CollectUsages { + for k, v := range CloudWatchUsageCounterMap { + ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, v.counters.UsageAPIcalls, k.AwsAccountID, k.AwsRegion, "usage") + ch <- prometheus.MustNewConstMetric(c.usageAllocatedStorage, prometheus.GaugeValue, v.CloudWatchUsage.AllocatedStorage, k.AwsAccountID, k.AwsRegion) + ch <- prometheus.MustNewConstMetric(c.usageDBInstances, prometheus.GaugeValue, v.CloudWatchUsage.DBInstances, k.AwsAccountID, k.AwsRegion) + ch <- prometheus.MustNewConstMetric(c.usageManualSnapshots, prometheus.GaugeValue, v.CloudWatchUsage.ManualSnapshots, k.AwsAccountID, k.AwsRegion) + } + } + + for k, v := range CloudwatchInstancesCounterMap { + // Cloudwatch metrics + ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, v.Counters.CloudwatchAPICalls, k.AwsAccountID, k.AwsRegion, "cloudwatch") + + var cwinstances = v.CloudWatchMetrics.Instances + for dbidentifier, instance := range cwinstances { + if instance.DatabaseConnections != nil { + ch <- prometheus.MustNewConstMetric(c.databaseConnections, prometheus.GaugeValue, *instance.DatabaseConnections, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.FreeStorageSpace != nil { + ch <- prometheus.MustNewConstMetric(c.freeStorageSpace, 
prometheus.GaugeValue, *instance.FreeStorageSpace, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.FreeableMemory != nil { + ch <- prometheus.MustNewConstMetric(c.freeableMemory, prometheus.GaugeValue, *instance.FreeableMemory, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.MaximumUsedTransactionIDs != nil { + ch <- prometheus.MustNewConstMetric(c.maximumUsedTransactionIDs, prometheus.GaugeValue, *instance.MaximumUsedTransactionIDs, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.ReadThroughput != nil { + ch <- prometheus.MustNewConstMetric(c.readThroughput, prometheus.GaugeValue, *instance.ReadThroughput, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.ReplicaLag != nil { + ch <- prometheus.MustNewConstMetric(c.replicaLag, prometheus.GaugeValue, *instance.ReplicaLag, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.ReplicationSlotDiskUsage != nil { + ch <- prometheus.MustNewConstMetric(c.replicationSlotDiskUsage, prometheus.GaugeValue, *instance.ReplicationSlotDiskUsage, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.SwapUsage != nil { + ch <- prometheus.MustNewConstMetric(c.swapUsage, prometheus.GaugeValue, *instance.SwapUsage, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.ReadIOPS != nil { + ch <- prometheus.MustNewConstMetric(c.readIOPS, prometheus.GaugeValue, *instance.ReadIOPS, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.WriteIOPS != nil { + ch <- prometheus.MustNewConstMetric(c.writeIOPS, prometheus.GaugeValue, *instance.WriteIOPS, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.WriteThroughput != nil { + ch <- prometheus.MustNewConstMetric(c.writeThroughput, prometheus.GaugeValue, *instance.WriteThroughput, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.TransactionLogsDiskUsage != nil { + ch <- prometheus.MustNewConstMetric(c.transactionLogsDiskUsage, prometheus.GaugeValue, 
*instance.TransactionLogsDiskUsage, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.DBLoad != nil { + ch <- prometheus.MustNewConstMetric(c.DBLoad, prometheus.GaugeValue, *instance.DBLoad, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.CPUUtilization != nil { + ch <- prometheus.MustNewConstMetric(c.cpuUtilisation, prometheus.GaugeValue, *instance.CPUUtilization, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.DBLoadCPU != nil { + ch <- prometheus.MustNewConstMetric(c.dBLoadCPU, prometheus.GaugeValue, *instance.DBLoadCPU, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + + if instance.DBLoadNonCPU != nil { + ch <- prometheus.MustNewConstMetric(c.dBLoadNonCPU, prometheus.GaugeValue, *instance.DBLoadNonCPU, k.AwsAccountID, k.AwsRegion, dbidentifier) + } + } + + // serviceQuotas metrics + if c.configuration.CollectQuotas { + for k, v := range ServiceQuotaMetricsCounterMap { + ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, v.Counters.ServiceQuotasAPICalls, k.AwsAccountID, k.AwsRegion, "servicequotas") + ch <- prometheus.MustNewConstMetric(c.quotaDBInstances, prometheus.GaugeValue, v.ServiceQuotaMetrics.DBinstances, k.AwsAccountID, k.AwsRegion) + ch <- prometheus.MustNewConstMetric(c.quotaTotalStorage, prometheus.GaugeValue, v.ServiceQuotaMetrics.TotalStorage, k.AwsAccountID, k.AwsRegion) + ch <- prometheus.MustNewConstMetric(c.quotaMaxDBInstanceSnapshots, prometheus.GaugeValue, v.ServiceQuotaMetrics.ManualDBInstanceSnapshots, k.AwsAccountID, k.AwsRegion) + } + } + + for k, v := range EC2MetricsCounterMap { + // EC2 metrics + ch <- prometheus.MustNewConstMetric(c.apiCall, prometheus.CounterValue, v.counters.EC2APIcalls, k.AwsAccountID, k.AwsRegion, "ec2") + var ec2instances = v.EC2.Instances + for instanceType, instance := range ec2instances { + ch <- prometheus.MustNewConstMetric(c.instanceMaximumIops, prometheus.GaugeValue, float64(instance.MaximumIops), k.AwsAccountID, k.AwsRegion, instanceType) + 
ch <- prometheus.MustNewConstMetric(c.instanceMaximumThroughput, prometheus.GaugeValue, instance.MaximumThroughput, k.AwsAccountID, k.AwsRegion, instanceType) + ch <- prometheus.MustNewConstMetric(c.instanceMemory, prometheus.GaugeValue, float64(instance.Memory), k.AwsAccountID, k.AwsRegion, instanceType) + ch <- prometheus.MustNewConstMetric(c.instanceVCPU, prometheus.GaugeValue, float64(instance.Vcpu), k.AwsAccountID, k.AwsRegion, instanceType) + } + } + } + c.logger.Info("Metrics are fetched completely.") + return + } + ch <- prometheus.MustNewConstMetric(c.up, prometheus.CounterValue, exporterUpStatusCode) // RDS metrics