diff --git a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml index b7482b8b5..c61d4b83c 100644 --- a/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-arc-k8s-crd.yaml @@ -19,8 +19,16 @@ metadata: name: container-insights-clusteridentityrequest namespace: azure-arc spec: - audience: https://monitoring.azure.com/ + {{- if eq (.Values.Azure.Cluster.Cloud | lower) "azurepubliccloud" }} + audience: https://monitor.azure.com/ + {{- else if eq (.Values.Azure.Cluster.Cloud | lower) "azurechinacloud" }} + audience: https://monitor.azure.cn/ + {{- else if eq (.Values.Azure.Cluster.Cloud | lower) "azureusgovernmentcloud" }} + audience: https://monitor.azure.us/ + {{- else }} + audience: https://monitor.azure.com/ + {{- end }} {{- if not (empty .Values.Azure.Extension.Name) }} - resourceId: {{ .Values.Azure.Extension.Name }} + resourceId: {{ .Values.Azure.Extension.Name }} {{- end }} {{- end }} diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index b581a324a..ef72b385b 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -69,6 +69,8 @@ spec: {{- else if ne .Values.Azure.Cluster.ResourceId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.Azure.Cluster.ResourceId | quote }} + - name: USING_AAD_MSI_AUTH + value: {{ .Values.omsagent.useAADAuth | quote }} {{- if ne .Values.Azure.Cluster.Region "" }} - name: AKS_REGION value: {{ .Values.Azure.Cluster.Region | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 3b48c26c4..5bd8bdf79 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -37,6 +37,16 @@ spec: serviceAccountName: omsagent {{- end }} containers: +{{- if and (ne .Values.Azure.Cluster.ResourceId "") (.Values.omsagent.useAADAuth) }} + - name: addon-token-adapter + imagePullPolicy: IfNotPresent + env: + - name: AZMON_COLLECT_ENV + value: "false" + - name: TOKEN_NAMESPACE + value: "azure-arc" +{{- .Values.Azure.Identity.MSIAdapterYaml | nindent 7 }} +{{- end }} - name: omsagent {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }} image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.image.tag }}" @@ -57,6 +67,8 @@ spec: {{- else if ne .Values.Azure.Cluster.ResourceId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.Azure.Cluster.ResourceId | quote }} + - name: USING_AAD_MSI_AUTH + value: {{ .Values.omsagent.useAADAuth | quote }} {{- if ne .Values.Azure.Cluster.Region "" }} - name: AKS_REGION value: {{ .Values.Azure.Cluster.Region | quote }} @@ -159,6 +171,8 @@ spec: {{- else if ne .Values.Azure.Cluster.ResourceId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.Azure.Cluster.ResourceId | quote }} + - name: USING_AAD_MSI_AUTH + value: {{ .Values.omsagent.useAADAuth | quote }} {{- if ne .Values.Azure.Cluster.Region "" }} - name: AKS_REGION value: {{ .Values.Azure.Cluster.Region | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml index 2a60fbb7f..a0abb0f57 100644 --- a/charts/azuremonitor-containers/templates/omsagent-deployment.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-deployment.yaml @@ -33,6 +33,16 @@ spec: serviceAccountName: omsagent {{- end }} containers: +{{- if and (ne .Values.Azure.Cluster.ResourceId "") (.Values.omsagent.useAADAuth) }} + - name: addon-token-adapter + imagePullPolicy: IfNotPresent + env: + - name: AZMON_COLLECT_ENV + value: "false" + - name: TOKEN_NAMESPACE + value: "azure-arc" +{{- .Values.Azure.Identity.MSIAdapterYaml | nindent 7 }} +{{- end }} - name: omsagent {{- if eq (.Values.omsagent.domain | lower) "opinsights.azure.cn" }} image: "mcr.azk8s.cn/azuremonitor/containerinsights/ciprod:{{ .Values.omsagent.image.tag }}" @@ -53,6 +63,8 @@ spec: {{- else if ne .Values.Azure.Cluster.ResourceId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.Azure.Cluster.ResourceId | quote }} + - name: USING_AAD_MSI_AUTH + value: {{ .Values.omsagent.useAADAuth | quote }} {{- if ne .Values.Azure.Cluster.Region "" }} - name: AKS_REGION value: {{ .Values.Azure.Cluster.Region | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml index a167e99a5..fe9b65973 100644 --- a/charts/azuremonitor-containers/templates/omsagent-rbac.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-rbac.yaml @@ -30,18 +30,9 @@ rules: verbs: ["list"] - apiGroups: ["clusterconfig.azure.com"] resources: ["azureclusteridentityrequests", "azureclusteridentityrequests/status"] - resourceNames: ["container-insights-clusteridentityrequest"] - verbs: ["get", "create", "patch"] + verbs: ["get", "create", "patch", "list", "update", "delete"] - nonResourceURLs: ["/metrics"] verbs: ["get"] -#arc k8s extension model grants access as part of the extension msi -#remove this explicit permission once the extension available in public preview -{{- if (empty .Values.Azure.Extension.Name) }} -- apiGroups: [""] - resources: ["secrets"] - resourceNames: ["container-insights-clusteridentityrequest-token"] - verbs: ["get"] -{{- end }} --- kind: ClusterRoleBinding {{- if .Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 4460f7756..104efb86d 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -7,6 +7,7 @@ ## Values of under Azure are being populated by Azure Arc K8s RP during the installation of the extension Azure: Cluster: + Cloud: Region: ResourceId: Extension: @@ -45,6 +46,9 @@ omsagent: # if set to true additional agent workflow logs will be emitted which are used for e2e and arc k8s conformance testing ISTEST: false + # This flag used to determine whether to use AAD MSI auth or not for Arc K8s cluster + useAADAuth: false + ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 6482daed9..184af787d 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -302,10 +302,19 @@ function Set-EnvironmentVariables { Write-Host "Failed to set environment variable AGENT_VERSION for target 'machine' since it is either null or empty" } + $kubernetesPort = [System.Environment]::GetEnvironmentVariable("KUBERNETES_PORT_443_TCP_PORT", "process") + if (![string]::IsNullOrEmpty($kubernetesPort)) { + [System.Environment]::SetEnvironmentVariable("KUBERNETES_PORT_443_TCP_PORT", $kubernetesPort, "machine") + Write-Host "Successfully set environment variable KUBERNETES_PORT_443_TCP_PORT - $($kubernetesPort) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable KUBERNETES_PORT_443_TCP_PORT for target 'machine' since it is either null or empty" + } + # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb .\setenv.ps1 - + #Parse the configmap to set the right environment variables for agent config. ruby /opt/omsagentwindows/scripts/ruby/tomlparser-agent-config.rb .\setagentenv.ps1 diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json index 28996f4a1..c42a1d074 100644 --- a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json +++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterOnboarding.json @@ -14,10 +14,10 @@ "description": "Location of the AKS resource e.g. \"East US\"" } }, - "aksResourceTagValues": { + "resourceTagValues": { "type": "object", "metadata": { - "description": "Existing all tags on AKS Cluster Resource" + "description": "Existing or new tags to use on AKS, ContainerInsights and DataCollectionRule Resources" } }, "workspaceLocation": { @@ -31,12 +31,6 @@ "metadata": { "description": "Full Resource ID of the log analitycs workspace that will be used for data destination. For example /subscriptions/00000000-0000-0000-0000-0000-00000000/resourceGroups/ResourceGroupName/providers/Microsoft.operationalinsights/workspaces/ws_xyz" } - }, - "dcrResourceTagValues": { - "type": "object", - "metadata": { - "description": "Existing or new tags on DCR Resource" - } } }, "variables": { @@ -70,7 +64,7 @@ "apiVersion": "2019-11-01-preview", "name": "[variables('dcrName')]", "location": "[parameters('workspaceLocation')]", - "tags": "[parameters('dcrResourceTagValues')]", + "tags": "[parameters('resourceTagValues')]", "kind": "Linux", "properties": { "dataSources": { @@ -184,7 +178,7 @@ "name": "[variables('clusterName')]", "type": "Microsoft.ContainerService/managedClusters", "location": "[parameters('aksResourceLocation')]", - "tags": "[parameters('aksResourceTagValues')]", + "tags": "[parameters('resourceTagValues')]", "apiVersion": "2018-03-31", "properties": { "mode": "Incremental", diff --git a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json index 31f0f9c49..e0f9a643f 100644 --- a/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json +++ b/scripts/onboarding/aks/onboarding-using-msi-auth/existingClusterParam.json @@ -8,20 +8,13 @@ "aksResourceLocation": { "value": "" }, - "aksResourceTagValues": { - "value": { - "": "", - "": "", - "": "" - } - }, "workspaceResourceId": { "value": "/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/" }, "workspaceLocation": { "value": "" }, - "dcrResourceTagValues": { + "resourceTagValues": { "value": { "": "", "": "", diff --git a/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterOnboarding.json b/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterOnboarding.json new file mode 100644 index 000000000..a4a4e3453 --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterOnboarding.json @@ -0,0 +1,224 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Resource Id of the Azure Arc Connected Cluster" + } + }, + "clusterRegion": { + "type": "string", + "metadata": { + "description": "Location of the Azure Arc Connected Cluster Resource e.g. \"eastus\"" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Resource ID" + } + }, + "workspaceRegion": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace region e.g. \"eastus\"" + } + }, + "workspaceDomain": { + "type": "string", + "allowedValues": [ + "opinsights.azure.com", + "opinsights.azure.cn", + "opinsights.azure.us", + "opinsights.azure.eaglex.ic.gov", + "opinsights.azure.microsoft.scloud" + ], + "defaultValue": "opinsights.azure.com", + "metadata": { + "description": "Azure Monitor Log Analytics Workspace Domain e.g. opinsights.azure.com" + } + }, + "resourceTagValues": { + "type": "object", + "metadata": { + "description": "Existing or new tags to use on Arc K8s ContainerInsights extension resources" + } + } + }, + "variables": { + "clusterSubscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]", + "clusterResourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]", + "clusterName": "[split(parameters('clusterResourceId'),'/')[8]]", + "clusterLocation": "[replace(parameters('clusterRegion'),' ', '')]", + "workspaceSubscriptionId": "[split(parameters('workspaceResourceId'),'/')[2]]", + "workspaceResourceGroup": "[split(parameters('workspaceResourceId'),'/')[4]]", + "dcrName": "[Concat('MSCI', '-', variables('clusterName'), '-', variables('clusterLocation'))]", + "associationName": "ContainerInsightsExtension", + "dataCollectionRuleId": "[resourceId(variables('workspaceSubscriptionId'), variables('workspaceResourceGroup'), 'Microsoft.Insights/dataCollectionRules', variables('dcrName'))]" + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('arc-k8s-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('workspaceSubscriptionId')]", + "resourceGroup": "[variables('workspaceResourceGroup')]", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.Insights/dataCollectionRules", + "apiVersion": "2019-11-01-preview", + "name": "[variables('dcrName')]", + "location": "[parameters('workspaceRegion')]", + "tags": "[parameters('resourceTagValues')]", + "kind": "Linux", + "properties": { + "dataSources": { + "extensions": [ + { + "name": "ContainerInsightsExtension", + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "extensionName": "ContainerInsights" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "[parameters('workspaceResourceId')]", + "name": "ciworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "destinations": [ + "ciworkspace" + ] + } + ] + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('arc-k8s-monitoring-msi-dcra', '-', uniqueString(parameters('clusterResourceId')))]", + "apiVersion": "2017-05-10", + "subscriptionId": "[variables('clusterSubscriptionId')]", + "resourceGroup": "[variables('clusterResourceGroup')]", + "dependsOn": [ + "[Concat('arc-k8s-monitoring-msi-dcr', '-', uniqueString(variables('dcrName')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.Kubernetes/connectedClusters/providers/dataCollectionRuleAssociations", + "name": "[concat(variables('clusterName'),'/microsoft.insights/', variables('associationName'))]", + "apiVersion": "2019-11-01-preview", + "properties": { + "description": "Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster.", + "dataCollectionRuleId": "[variables('dataCollectionRuleId')]" + } + } + ] + }, + "parameters": {} + } + }, + { + "type": "Microsoft.Resources/deployments", + "name": "[Concat('arc-k8s-ci-extension', '-', uniqueString(parameters('clusterResourceId')))]", + "apiVersion": "2019-05-01", + "subscriptionId": "[split(parameters('clusterResourceId'),'/')[2]]", + "resourceGroup": "[split(parameters('clusterResourceId'),'/')[4]]", + "dependsOn": [ + "[Concat('arc-k8s-monitoring-msi-dcra', '-', uniqueString(parameters('clusterResourceId')))]" + ], + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": {}, + "variables": {}, + "resources": [ + { + "type": "Microsoft.KubernetesConfiguration/extensions", + "apiVersion": "2021-09-01", + "name": "azuremonitor-containers", + "location": "[parameters('clusterRegion')]", + "identity": { + "type": "systemassigned" + }, + "properties": { + "extensionType": "Microsoft.AzureMonitor.Containers", + "configurationSettings": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]", + "omsagent.domain": "[parameters('workspaceDomain')]", + "omsagent.useAADAuth": "true" + }, + "configurationProtectedSettings": { + "omsagent.secret.wsid": "[reference(parameters('workspaceResourceId'), '2015-03-20').customerId]", + "omsagent.secret.key": "[listKeys(parameters('workspaceResourceId'), '2015-03-20').primarySharedKey]" + }, + "autoUpgradeMinorVersion": true, + "releaseTrain": "Stable", + "scope": { + "Cluster": { + "releaseNamespace": "azuremonitor-containers" + } + } + }, + "scope": "[concat('Microsoft.Kubernetes/connectedClusters/', split(parameters('clusterResourceId'),'/')[8])]" + } + ] + } + } + } + ] +} diff --git a/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterParam.json b/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterParam.json new file mode 100644 index 000000000..8cd17ceb3 --- /dev/null +++ b/scripts/onboarding/templates/arc-k8s-extension-msi-auth/existingClusterParam.json @@ -0,0 +1,28 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterResourceId": { + "value": "/subscriptions//resourceGroups//providers/Microsoft.Kubernetes/connectedClusters/" + }, + "clusterRegion": { + "value": "" + }, + "workspaceResourceId": { + "value": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/" + }, + "workspaceRegion": { + "value": "" + }, + "workspaceDomain": { + "value": "" + }, + "resourceTagValues": { + "value": { + "": "", + "": "", + "": "" + } + } + } +} diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index 9f30afab1..1960f82b8 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -13,6 +13,7 @@ require ( github.com/tinylib/msgp v1.1.2 github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 gopkg.in/natefinch/lumberjack.v2 v2.0.0-20170531160350-a96e63847dc3 + k8s.io/api v0.21.0 k8s.io/apimachinery v0.21.0 k8s.io/client-go v0.21.0 ) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 4f245a514..81039f966 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -1,6 +1,7 @@ package main import ( + "context" "encoding/json" "errors" "fmt" @@ -12,6 +13,9 @@ import ( "strconv" "strings" "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) const IMDSTokenPathForWindows = "c:/etc/imds-access-token/token" // only used in windows @@ -29,6 +33,12 @@ var IngestionAuthToken string var IngestionAuthTokenExpiration int64 var AMCSRedirectedEndpoint string = "" +// Arc k8s MSI related +const ArcK8sClusterConfigCRDAPIVersion = "clusterconfig.azure.com/v1beta1" +const ArcK8sClusterIdentityResourceName = "container-insights-clusteridentityrequest" +const ArcK8sClusterIdentityResourceNameSpace = "azure-arc" +const ArcK8sMSITokenSecretNameSpace = "azure-arc" + type IMDSResponse struct { AccessToken string `json:"access_token"` ClientID string `json:"client_id"` @@ -66,8 +76,8 @@ type AgentConfiguration struct { } `json:"channels"` Extensionconfigurations struct { Containerinsights []struct { - ID string `json:"id"` - Originids []string `json:"originIds"` + ID string `json:"id"` + Originids []string `json:"originIds"` Outputstreams struct { LinuxPerfBlob string `json:"LINUX_PERF_BLOB"` ContainerInventoryBlob string `json:"CONTAINER_INVENTORY_BLOB"` @@ -93,14 +103,47 @@ type IngestionTokenResponse struct { Ingestionauthtoken string `json:"ingestionAuthToken"` } +type ContainerInsightsIdentityRequest struct { + APIVersion string `json:"apiVersion"` + Kind string `json:"kind"` + Metadata struct { + Annotations struct { + MetaHelmShReleaseName string `json:"meta.helm.sh/release-name"` + MetaHelmShReleaseNamespace string `json:"meta.helm.sh/release-namespace"` + } `json:"annotations"` + CreationTimestamp time.Time `json:"creationTimestamp"` + Generation int `json:"generation"` + Labels struct { + AppKubernetesIoManagedBy string `json:"app.kubernetes.io/managed-by"` + } `json:"labels"` + Name string `json:"name"` + Namespace string `json:"namespace"` + ResourceVersion string `json:"resourceVersion"` + SelfLink string `json:"selfLink"` + UID string `json:"uid"` + } `json:"metadata"` + Spec struct { + Audience string `json:"audience"` + ResourceID string `json:"resourceId"` + } `json:"spec"` + Status struct { + ExpirationTime time.Time `json:"expirationTime"` + TokenReference struct { + DataName string `json:"dataName"` + SecretName string `json:"secretName"` + } `json:"tokenReference"` + } `json:"status"` +} + func getAccessTokenFromIMDS() (string, int64, error) { Log("Info getAccessTokenFromIMDS: start") useIMDSTokenProxyEndPoint := os.Getenv("USE_IMDS_TOKEN_PROXY_END_POINT") imdsAccessToken := "" + var expiration int64 var responseBytes []byte var err error - if (useIMDSTokenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTokenProxyEndPoint), "true") == 0) { + if useIMDSTokenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTokenProxyEndPoint), "true") == 0 { Log("Info Reading IMDS Access Token from IMDS Token proxy endpoint") mcsEndpoint := os.Getenv("MCS_ENDPOINT") msi_endpoint_string := fmt.Sprintf("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://%s/", mcsEndpoint) @@ -108,12 +151,12 @@ func getAccessTokenFromIMDS() (string, int64, error) { msi_endpoint, err := url.Parse(msi_endpoint_string) if err != nil { Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error()) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } req, err := http.NewRequest("GET", msi_endpoint.String(), nil) if err != nil { Log("getAccessTokenFromIMDS: Error creating HTTP request: %s", err.Error()) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } req.Header.Add("Metadata", "true") @@ -133,14 +176,14 @@ func getAccessTokenFromIMDS() (string, int64, error) { } if resp != nil && resp.Body != nil { - defer resp.Body.Close() + defer resp.Body.Close() } Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount) - if IsRetriableError(resp.StatusCode) { + if IsRetriableError(resp.StatusCode) { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond if resp.StatusCode == 429 { if resp != nil && resp.Header.Get("Retry-After") != "" { after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) @@ -155,44 +198,103 @@ func getAccessTokenFromIMDS() (string, int64, error) { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) SendException(message) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } IsSuccess = true break // call succeeded, don't retry any more } if !IsSuccess || resp == nil || resp.Body == nil { Log("getAccessTokenFromIMDS: IMDS Request ran out of retries") - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } // Pull out response body responseBytes, err = ioutil.ReadAll(resp.Body) if err != nil { Log("getAccessTokenFromIMDS: Error reading response body: %s", err.Error()) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } } else { - Log("Info Reading IMDS Access Token from file : %s", IMDSTokenPathForWindows) - if _, err = os.Stat(IMDSTokenPathForWindows); os.IsNotExist(err) { - Log("getAccessTokenFromIMDS: IMDS token file doesnt exist: %s", err.Error()) - return imdsAccessToken, 0, err - } - //adding retries incase if we ended up reading the token file while the token file being written - for retryCount := 0; retryCount < MaxRetries; retryCount++ { - responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) - if err != nil { - Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) - time.Sleep(time.Duration((retryCount + 1) * 100) * time.Millisecond) - continue + resourceId := os.Getenv("AKS_RESOURCE_ID") + if resourceId != "" && strings.Contains(strings.ToLower(resourceId), strings.ToLower("Microsoft.ContainerService/managedClusters")) { + Log("Info Reading IMDS Access Token from file : %s", IMDSTokenPathForWindows) + if _, err = os.Stat(IMDSTokenPathForWindows); os.IsNotExist(err) { + Log("getAccessTokenFromIMDS: IMDS token file doesnt exist: %s", err.Error()) + return imdsAccessToken, expiration, err + } + //adding retries incase if we ended up reading the token file while the token file being written + for retryCount := 0; retryCount < MaxRetries; retryCount++ { + responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) + if err != nil { + Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) + time.Sleep(time.Duration((retryCount+1)*100) * time.Millisecond) + continue + } + break + } + } else { + Log("getAccessTokenFromIMDS: Info Getting MSI Access Token reference from CRD and token from secret for Azure Arc K8s cluster") + var crdResponseBytes []byte + var errorMessage string + for retryCount := 0; retryCount < MaxRetries; retryCount++ { + crd_request_endpoint := fmt.Sprintf("/apis/%s/namespaces/%s/azureclusteridentityrequests/%s", ArcK8sClusterConfigCRDAPIVersion, ArcK8sClusterIdentityResourceNameSpace, ArcK8sClusterIdentityResourceName) + crdResponseBytes, err = ClientSet.RESTClient().Get().AbsPath(crd_request_endpoint).DoRaw(context.TODO()) + if err != nil { + Log("getAccessTokenFromIMDS: Failed to get the CRD: %s in namespace: %s, retryCount: %d", ArcK8sClusterIdentityResourceName, ArcK8sClusterIdentityResourceNameSpace, err.Error(), retryCount) + time.Sleep(time.Duration((retryCount+1)*100) * time.Millisecond) + continue + } + break } - break - } - } + if crdResponseBytes != nil { + var ciCRDRequest ContainerInsightsIdentityRequest + err = json.Unmarshal(crdResponseBytes, &ciCRDRequest) + if err != nil { + errorMessage = fmt.Sprintf("getAccessTokenFromIMDS: Error unmarshalling the crdResponseBytes: %s", err.Error()) + Log(errorMessage) + return imdsAccessToken, expiration, errors.New(errorMessage) + } else { + status := ciCRDRequest.Status + tokenReference := status.TokenReference + dataFieldName := tokenReference.DataName + secretName := tokenReference.SecretName + expirationTime := status.ExpirationTime + if dataFieldName == "" || secretName == "" || expirationTime.IsZero() { + errorMessage = "getAccessTokenFromIMDS: Either dataName or SecretName or ExpirationTime values empty which indicates token not refreshed" + Log(errorMessage) + Log("getAccessTokenFromIMDS: dataName: %s, secretName: %s, expirationTime: %s", dataFieldName, secretName, expirationTime) + return imdsAccessToken, expiration, errors.New(errorMessage) + } else { + var secret *v1.Secret + for retryCount := 0; retryCount < MaxRetries; retryCount++ { + secret, err = ClientSet.CoreV1().Secrets(ArcK8sMSITokenSecretNameSpace).Get(context.TODO(), secretName, metav1.GetOptions{}) + if err != nil { + Log("getAccessTokenFromIMDS: Failed to read the secret: %s in namespace: %s, error: %s, retryCount: %d", secretName, ArcK8sMSITokenSecretNameSpace, err.Error(), retryCount) + time.Sleep(time.Duration((retryCount+1)*100) * time.Millisecond) + continue + } + break + } + if secret == nil { + errorMessage = fmt.Sprintf("getAccessTokenFromIMDS: value of secret: %s in nil in namespace: %s", secretName, ArcK8sMSITokenSecretNameSpace) + return imdsAccessToken, expiration, errors.New(errorMessage) + } + imdsAccessToken = string(secret.Data[dataFieldName]) + expiration = expirationTime.Unix() + return imdsAccessToken, expiration, nil + } + } + } else { + errorMessage = fmt.Sprintf("getAccessTokenFromIMDS: faled to get the CRD: %s in namespace: %s", ArcK8sClusterIdentityResourceName, ArcK8sClusterIdentityResourceNameSpace) + return imdsAccessToken, expiration, errors.New(errorMessage) + } + } + } - if responseBytes == nil { + if responseBytes == nil { Log("getAccessTokenFromIMDS: Error responseBytes is nil") - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } // Unmarshall response body into struct @@ -200,14 +302,14 @@ func getAccessTokenFromIMDS() (string, int64, error) { err = json.Unmarshal(responseBytes, &imdsResponse) if err != nil { Log("getAccessTokenFromIMDS: Error unmarshalling the response: %s", err.Error()) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } imdsAccessToken = imdsResponse.AccessToken - expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) + expiration, err = strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) if err != nil { Log("getAccessTokenFromIMDS: Error parsing ExpiresOn field from IMDS response: %s", err.Error()) - return imdsAccessToken, 0, err + return imdsAccessToken, expiration, err } Log("Info getAccessTokenFromIMDS: end") return imdsAccessToken, expiration, nil @@ -258,7 +360,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan } if resp != nil && resp.Body != nil { defer resp.Body.Close() - } + } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) if resp.StatusCode == 421 { // AMCS returns redirected endpoint incase of private link agentConfigEndpoint := resp.Header.Get("x-ms-agent-config-endpoint") @@ -282,7 +384,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan if IsRetriableError(resp.StatusCode) { message := fmt.Sprintf("getAgentConfiguration: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond if resp.StatusCode == 429 { if resp != nil && resp.Header.Get("Retry-After") != "" { after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) @@ -382,7 +484,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann req.Header.Add("Authorization", bearer) var resp *http.Response = nil - IsSuccess := false + IsSuccess := false for retryCount := 0; retryCount < MaxRetries; retryCount++ { // Call managed services for Azure resources token endpoint resp, err = HTTPClient.Do(req) @@ -396,7 +498,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann if resp != nil && resp.Body != nil { defer resp.Body.Close() - } + } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) if resp.StatusCode == 421 { // AMCS returns redirected endpoint incase of private link @@ -421,7 +523,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann if IsRetriableError(resp.StatusCode) { message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + retryDelay := time.Duration((retryCount+1)*100) * time.Millisecond if resp.StatusCode == 429 { if resp != nil && resp.Header.Get("Retry-After") != "" { after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) @@ -429,7 +531,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann retryDelay = time.Duration(after) * time.Second } } - } + } time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { @@ -504,7 +606,7 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva func refreshIngestionAuthToken() { for ; true; <-IngestionAuthTokenRefreshTicker.C { - if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // token valid 24 hrs and refresh token 1 hr before expiry + if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix()+60*60) { // token valid 24 hrs and refresh token 1 hr before expiry imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() if err != nil { message := fmt.Sprintf("refreshIngestionAuthToken: Error on getAccessTokenFromIMDS %s \n", err.Error()) @@ -532,7 +634,7 @@ func refreshIngestionAuthToken() { continue } } - if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" { + if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" { message := "refreshIngestionAuthToken: IMDSToken or ConfigurationId or ChannelId empty" Log(message) SendException(message) @@ -560,9 +662,9 @@ func refreshIngestionAuthToken() { func IsRetriableError(httpStatusCode int) bool { retryableStatusCodes := [5]int{408, 429, 502, 503, 504} for _, code := range retryableStatusCodes { - if code == httpStatusCode { - return true - } + if code == httpStatusCode { + return true + } } return false } diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 72b035d45..542f342a6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -135,4 +135,7 @@ class Constants #This is for telemetry to track if any of the windows customer has any of the field size >= 64KB #To evaluate switching to Windows AMA 64KB impacts any existing customers MAX_RECORD_OR_FIELD_SIZE_FOR_TELEMETRY = 65536 + + # only used in windows in AAD MSI auth mode + IMDS_TOKEN_PATH_FOR_WINDOWS = "c:/etc/imds-access-token/token" end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index e10a2049f..c83972f11 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -1,7 +1,7 @@ #!/usr/local/bin/ruby # frozen_string_literal: true -require 'fluent/plugin/output' +require "fluent/plugin/output" module Fluent::Plugin class OutputMDM < Output @@ -19,7 +19,7 @@ def initialize require_relative "constants" require_relative "arc_k8s_cluster_identity" require_relative "proxy_utils" - + require_relative "extension_utils" @@token_resource_url = "https://monitoring.azure.com/" # AAD auth supported only in public cloud and handle other clouds when enabled # this is unified new token audience for LA AAD MSI auth & metrics @@ -52,6 +52,7 @@ def initialize # Setting useMsi to false by default @useMsi = false @isAADMSIAuth = false + @isWindows = false @metrics_flushed_count = 0 @cluster_identity = nil @@ -88,6 +89,9 @@ def start aks_region = aks_region.gsub(" ", "") end + @isWindows = isWindows() + @isAADMSIAuth = ExtensionUtils.isAADMSIAuthMode() + if @can_send_data_to_mdm @log.info "MDM Metrics supported in #{aks_region} region" @@ -108,11 +112,19 @@ def start @log.info "POST Request url: #{@@post_request_url}" ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMPluginStart", {}) - # arc k8s cluster uses cluster identity if (!!@isArcK8sCluster) - @log.info "using cluster identity token since cluster is azure arc k8s cluster" - @cluster_identity = ArcK8sClusterIdentity.new - @cached_access_token = @cluster_identity.get_cluster_identity_token + if @isAADMSIAuth && !@isWindows + @log.info "using IMDS sidecar endpoint for MSI token since its Arc k8s and Linux node" + @useMsi = true + msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience } + @parsed_token_uri = URI.parse(msi_endpoint) + @cached_access_token = get_access_token + else + # switch to IMDS endpoint for the windows once the Arc K8s team supports the IMDS sidecar for windows + @log.info "using cluster identity token since cluster is azure arc k8s cluster" + @cluster_identity = ArcK8sClusterIdentity.new + @cached_access_token = @cluster_identity.get_cluster_identity_token + end else # azure json file only used for aks and doesnt exist in non-azure envs file = File.read(@@azure_json_path) @@ -132,7 +144,6 @@ def start else # in case of aad msi auth user_assigned_client_id will be empty @log.info "using aad msi auth" - @isAADMSIAuth = true msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience } end @parsed_token_uri = URI.parse(msi_endpoint) @@ -153,48 +164,59 @@ def get_access_token if (Time.now > @get_access_token_backoff_expiry) http_access_token = nil retries = 0 + properties = {} begin if @cached_access_token.to_s.empty? || (Time.now + 5 * 60 > @token_expiry_time) # Refresh token 5 minutes from expiration @log.info "Refreshing access token for out_mdm plugin.." - - if (!!@useMsi) - properties = {} - if (!!@isAADMSIAuth) - @log.info "Using aad msi auth to get the token to post MDM data" - properties["aadAuthMSIMode"] = "true" + if (!!@isAADMSIAuth) + properties["aadAuthMSIMode"] = "true" + end + if @isAADMSIAuth && @isWindows + @log.info "reading the token from IMDS token file since its windows.." + if File.exist?(Constants::IMDS_TOKEN_PATH_FOR_WINDOWS) && File.readable?(Constants::IMDS_TOKEN_PATH_FOR_WINDOWS) + token_content = File.read(Constants::IMDS_TOKEN_PATH_FOR_WINDOWS).strip + parsed_json = JSON.parse(token_content) + @token_expiry_time = Time.now + @@token_refresh_back_off_interval * 60 # set the expiry time to be ~ thirty minutes from current time + @cached_access_token = parsed_json["access_token"] + @log.info "Successfully got access token" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", properties) else - @log.info "Using msi to get the token to post MDM data" + raise "Either MSI Token file path doesnt exist or not readble" end - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", properties) - @log.info "Opening TCP connection" - http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false) - # http_access_token.use_ssl = false - token_request = Net::HTTP::Get.new(@parsed_token_uri.request_uri) - token_request["Metadata"] = true else - @log.info "Using SP to get the token to post MDM data" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-SP", {}) - @log.info "Opening TCP connection" - http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => true) - # http_access_token.use_ssl = true - token_request = Net::HTTP::Post.new(@parsed_token_uri.request_uri) - token_request.set_form_data( - { - "grant_type" => @@grant_type, - "client_id" => @data_hash["aadClientId"], - "client_secret" => @data_hash["aadClientSecret"], - "resource" => @@token_resource_url, - } - ) - end + if (!!@useMsi) + @log.info "Using msi to get the token to post MDM data" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", properties) + @log.info "Opening TCP connection" + http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false) + # http_access_token.use_ssl = false + token_request = Net::HTTP::Get.new(@parsed_token_uri.request_uri) + token_request["Metadata"] = true + else + @log.info "Using SP to get the token to post MDM data" + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-SP", {}) + @log.info "Opening TCP connection" + http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => true) + # http_access_token.use_ssl = true + token_request = Net::HTTP::Post.new(@parsed_token_uri.request_uri) + token_request.set_form_data( + { + "grant_type" => @@grant_type, + "client_id" => @data_hash["aadClientId"], + "client_secret" => @data_hash["aadClientSecret"], + "resource" => @@token_resource_url, + } + ) + end - @log.info "making request to get token.." - token_response = http_access_token.request(token_request) - # Handle the case where the response is not 200 - parsed_json = JSON.parse(token_response.body) - @token_expiry_time = Time.now + @@token_refresh_back_off_interval * 60 # set the expiry time to be ~ thirty minutes from current time - @cached_access_token = parsed_json["access_token"] - @log.info "Successfully got access token" + @log.info "making request to get token.." + token_response = http_access_token.request(token_request) + # Handle the case where the response is not 200 + parsed_json = JSON.parse(token_response.body) + @token_expiry_time = Time.now + @@token_refresh_back_off_interval * 60 # set the expiry time to be ~ thirty minutes from current time + @cached_access_token = parsed_json["access_token"] + @log.info "Successfully got access token" + end end rescue => err @log.info "Exception in get_access_token: #{err}" @@ -316,10 +338,15 @@ def write(chunk) def send_to_mdm(post_body) begin if (!!@isArcK8sCluster) - if @cluster_identity.nil? - @cluster_identity = ArcK8sClusterIdentity.new + if @isAADMSIAuth && !@isWindows + access_token = get_access_token + else + # switch to IMDS sidecar endpoint for the windows once the Arc K8s team supports + if @cluster_identity.nil? + @cluster_identity = ArcK8sClusterIdentity.new + end + access_token = @cluster_identity.get_cluster_identity_token end - access_token = @cluster_identity.get_cluster_identity_token else access_token = get_access_token end @@ -336,7 +363,7 @@ def send_to_mdm(post_body) ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" else @@ -368,5 +395,18 @@ def send_to_mdm(post_body) raise e end end + + def isWindows() + isWindows = false + begin + os_type = ENV["OS_TYPE"] + if !os_type.nil? && !os_type.empty? && os_type.strip.casecmp("windows") == 0 + isWindows = true + end + rescue => error + @log.warn "Error in MDM isWindows method: #{error}" + end + return isWindows + end end # class OutputMDM end # module Fluent diff --git a/test/e2e/conformance.yaml b/test/e2e/conformance.yaml index 71e40a6a2..bf9a3727a 100644 --- a/test/e2e/conformance.yaml +++ b/test/e2e/conformance.yaml @@ -3,7 +3,7 @@ sonobuoy-config: plugin-name: azure-arc-ci-conformance result-format: junit spec: - image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest10202021 + image: mcr.microsoft.com/azuremonitor/containerinsights/cidev:ciconftest04282022 imagePullPolicy: Always name: plugin resources: {} diff --git a/test/e2e/src/core/e2e_tests.sh b/test/e2e/src/core/e2e_tests.sh index dd9d93073..db89adf9a 100644 --- a/test/e2e/src/core/e2e_tests.sh +++ b/test/e2e/src/core/e2e_tests.sh @@ -111,8 +111,13 @@ addArcConnectedK8sExtension() { } addArcK8sCLIExtension() { - echo "adding Arc K8s k8s-extension extension" - az extension add --name k8s-extension + if [ ! -z "$K8S_EXTENSION_WHL_URL" ]; then + echo "adding Arc K8s k8s-extension cli extension from whl file path ${K8S_EXTENSION_WHL_URL}" + az extension add --source $K8S_EXTENSION_WHL_URL -y + else + echo "adding Arc K8s k8s-extension cli extension" + az extension add --name k8s-extension + fi } createArcCIExtension() { @@ -125,7 +130,11 @@ createArcCIExtension() { basicparameters="$basicparameters --version $CI_ARC_VERSION" fi - az k8s-extension create $basicparameters --configuration-settings omsagent.ISTEST=true + if [ ! -z "$USE_AAD_AUTH" ]; then + az k8s-extension create $basicparameters --configuration-settings omsagent.ISTEST=true omsagent.useAADAuth=$USE_AAD_AUTH + else + az k8s-extension create $basicparameters --configuration-settings omsagent.ISTEST=true + fi } showArcCIExtension() { diff --git a/test/e2e/src/tests/test_e2e_workflows.py b/test/e2e/src/tests/test_e2e_workflows.py index 02ad8cf14..1515534e1 100755 --- a/test/e2e/src/tests/test_e2e_workflows.py +++ b/test/e2e/src/tests/test_e2e_workflows.py @@ -38,7 +38,16 @@ def test_e2e_workflows(env_dict): if len(pod_list.items) <= 0: pytest.fail("number of items in pod list should be greater than 0") + if len(pod_list.items[0].spec.containers) < 1: + pytest.fail("number of containers in pod item should be at least 1") + envVars = pod_list.items[0].spec.containers[0].env + if (len(pod_list.items[0].spec.containers) > 1): + for container in pod_list.items[0].spec.containers: + if (container.name == constants.OMSAGENT_MAIN_CONTAINER_NAME): + envVars = container.env + break + if not envVars: pytest.fail("environment variables should be defined in the replicaset pod") diff --git a/test/e2e/src/tests/test_node_metrics_e2e_workflow.py b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py index dfcc89dde..264abad6b 100755 --- a/test/e2e/src/tests/test_node_metrics_e2e_workflow.py +++ b/test/e2e/src/tests/test_node_metrics_e2e_workflow.py @@ -12,6 +12,8 @@ pytestmark = pytest.mark.agentests # validation of node metrics e2e workflow + + def test_node_metrics_e2e_workflow(env_dict): print("Starting node metrics e2e workflow test.") append_result_output("test_node_metrics_e2e_workflow start \n", @@ -39,7 +41,16 @@ def test_node_metrics_e2e_workflow(env_dict): if len(pod_list.items) <= 0: pytest.fail("number of items in pod list should be greater than 0") + if len(pod_list.items[0].spec.containers) < 1: + pytest.fail("number of containers in pod item should be at least 1") + envVars = pod_list.items[0].spec.containers[0].env + if (len(pod_list.items[0].spec.containers) > 1): + for container in pod_list.items[0].spec.containers: + if (container.name == constants.OMSAGENT_MAIN_CONTAINER_NAME): + envVars = container.env + break + if not envVars: pytest.fail( "environment variables should be defined in the replicaset pod") @@ -71,9 +82,11 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("access_token shouldnt be null or empty") waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS'] - print("start: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) + print("start: waiting for seconds: {} for agent workflows to get emitted".format( + waitTimeSeconds)) time.sleep(int(waitTimeSeconds)) - print("complete: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) + print("complete: waiting for seconds: {} for agent workflows to get emitted".format( + waitTimeSeconds)) # validate metrics e2e workflow now = datetime.utcnow() @@ -105,8 +118,8 @@ def test_node_metrics_e2e_workflow(env_dict): "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format( - response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -122,18 +135,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_MEMORY_RSS_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_MEMORY_RSS_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_MEMORY_RSS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - memoryRssPercentage custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -154,8 +170,8 @@ def test_node_metrics_e2e_workflow(env_dict): "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format( - response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -171,18 +187,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_MEMORY_RSS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - memoryWorkingSetBytes custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -199,11 +218,12 @@ def test_node_metrics_e2e_workflow(env_dict): headers=Headers) if not response: - pytest.fail("response of the metrics query API shouldnt be null or empty") + pytest.fail( + "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format( - response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -219,18 +239,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_MEMORY_WS_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_WS_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_MEMORY_WS_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_MEMORY_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORYE_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_MEMORYE_WS_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - memoryWorkingSetPercentage custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -247,11 +270,12 @@ def test_node_metrics_e2e_workflow(env_dict): headers=Headers) if not response: - pytest.fail("response of the metrics query API shouldnt be null or empty") + pytest.fail( + "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format( - response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -267,18 +291,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_MEMORY_WS_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - cpuUsageMilliCores custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -295,10 +322,12 @@ def test_node_metrics_e2e_workflow(env_dict): headers=Headers) if not response: - pytest.fail("response of the metrics query API shouldnt be null or empty") + pytest.fail( + "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -314,18 +343,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_CPU_USAGE_MILLI_CORES_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - cpuUsagePercentage custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -342,10 +374,12 @@ def test_node_metrics_e2e_workflow(env_dict): headers=Headers) if not response: - pytest.fail("response of the metrics query API shouldnt be null or empty") + pytest.fail( + "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -361,18 +395,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_CPU_USAGE_PERCENTAGE_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) # node metric - nodesCount custommetricsUrl = '{0}{1}/providers/microsoft.Insights/metrics?timespan={2}/{3}&interval=FULL&metricnames={4}&aggregation={5}&metricNamespace={6}&validatedimensions=false&api-version={7}'.format( @@ -389,10 +426,12 @@ def test_node_metrics_e2e_workflow(env_dict): headers=Headers) if not response: - pytest.fail("response of the metrics query API shouldnt be null or empty") + pytest.fail( + "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format(response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -408,18 +447,21 @@ def test_node_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.NODE_COUNT_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.NODE_COUNT_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.NODE_COUNT_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.NODE_COUNT_METRIC_NAME, constants.NODE_METRICS_NAMESPACE)) append_result_output("test_node_metrics_e2e_workflow end \n", env_dict['TEST_AGENT_LOG_FILE']) diff --git a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py index 81e2b77a9..4be36b8a9 100755 --- a/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py +++ b/test/e2e/src/tests/test_pod_metrics_e2e_workflow.py @@ -12,6 +12,8 @@ pytestmark = pytest.mark.agentests # validation of pod metrics e2e workflows + + def test_pod_metrics_e2e_workflow(env_dict): print("Starting pod metrics e2e workflows test.") append_result_output("test_pod_metrics_e2e_workflow start \n", @@ -39,7 +41,16 @@ def test_pod_metrics_e2e_workflow(env_dict): if len(pod_list.items) <= 0: pytest.fail("number of items in pod list should be greater than 0") + if len(pod_list.items[0].spec.containers) < 1: + pytest.fail("number of containers in pod item should be at least 1") + envVars = pod_list.items[0].spec.containers[0].env + if (len(pod_list.items[0].spec.containers) > 1): + for container in pod_list.items[0].spec.containers: + if (container.name == constants.OMSAGENT_MAIN_CONTAINER_NAME): + envVars = container.env + break + if not envVars: pytest.fail( "environment variables should be defined in the replicaset pod") @@ -71,9 +82,11 @@ def test_pod_metrics_e2e_workflow(env_dict): pytest.fail("access_token shouldnt be null or empty") waitTimeSeconds = env_dict['AGENT_WAIT_TIME_SECS'] - print("start: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) + print("start: waiting for seconds: {} for agent workflows to get emitted".format( + waitTimeSeconds)) time.sleep(int(waitTimeSeconds)) - print("complete: waiting for seconds: {} for agent workflows to get emitted".format(waitTimeSeconds)) + print("complete: waiting for seconds: {} for agent workflows to get emitted".format( + waitTimeSeconds)) # validate metrics e2e workflow now = datetime.utcnow() @@ -104,8 +117,8 @@ def test_pod_metrics_e2e_workflow(env_dict): "response of the metrics query API shouldnt be null or empty") if response.status_code != 200: - pytest.fail("metrics query API failed with an error code: {}".format( - response.status_code)) + pytest.fail("metrics query API failed with an error code: {}".format( + response.status_code)) responseJSON = response.json() if not responseJSON: @@ -121,18 +134,21 @@ def test_pod_metrics_e2e_workflow(env_dict): pytest.fail("response JSON shouldnt be null or empty") if len(responseValues) <= 0: - pytest.fail("length of value array in the response should be greater than 0") + pytest.fail( + "length of value array in the response should be greater than 0") for responseVal in responseValues: metricName = responseVal['name']['value'] if metricName != constants.POD_COUNT_METRIC_NAME: - pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format(metricName, constants.POD_COUNT_METRIC_NAME)) + pytest.fail("got the metricname: {0} but expected metricname:{1} in the response".format( + metricName, constants.POD_COUNT_METRIC_NAME)) timeseries = responseVal['timeseries'] if not timeseries: pytest.fail("metric series shouldnt be null or empty for metric:{0} in namespace: {1}".format( constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE)) if len(timeseries) <= 0: - pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format(constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE)) + pytest.fail("length of timeseries should be greater than for 0 for metric: {0} in namespace :{1}".format( + constants.POD_COUNT_METRIC_NAME, constants.POD_METRICS_NAMESPACE)) append_result_output("test_pod_metrics_e2e_workflow end \n", env_dict['TEST_AGENT_LOG_FILE'])