Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix) 멤버 클러스터의 3시간 이상 된 데이터를 볼 수 없는 상황 해소 #250

Merged
merged 7 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 0 additions & 173 deletions deploy_apps/tks-lma-federation-wftpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -311,179 +311,6 @@ spec:
echo "$thanos_sc_ep:$THANOS_SC_PORT" > /mnt/out/thanos_sc_ep.txt
fi

# Template: collectThanosScEndpoints
# Queries the TKS API for every RUNNING cluster in the same organization as the
# current cluster, and collects the PROMETHEUS application endpoint of each
# cluster that has an LMA app group and a reachable site repository.
#
# Outputs (files under /mnt/out):
#   outwards_cluster_list - JSON list of {"name": <cluster id>} objects
#   inwards_endpoint_map  - Python repr of {'querier.stores': [<endpoint>, ...]}
#   cur_cluster_name      - id of the current cluster
#
# NOTE(review): the declared inputs are not read by the script; it reads the
# workflow-level parameters tks_info_host and cluster_id instead - confirm
# whether the inputs are still needed by callers.
- name: collectThanosScEndpoints
  inputs:
    parameters:
      - name: tks_info_host
      - name: app_group_id
  outputs:
    parameters:
      - name: outwards_cluster_list
        valueFrom:
          path: /mnt/out/cluster_list.txt
      - name: inwards_endpoint_map
        valueFrom:
          path: /mnt/out/inwards_endpoint.txt
      - name: cur_cluster_name
        valueFrom:
          path: /mnt/out/cur_cluster_name.txt
  volumes:
    - name: out
      emptyDir: {}
  script:
    name: 'collect'
    image: harbor.taco-cat.xyz/tks/centos-tks-api:v1.0
    command: ["python"]
    envFrom:
      - secretRef:
          name: "git-svc-token"
      - secretRef:
          name: "tks-api-secret"
    volumeMounts:
      - name: out
        mountPath: /mnt/out
    source: |
      import sys
      import os
      import shutil
      import git
      import requests
      import json

      TKS_API_URL = "{{workflow.parameters.tks_info_host}}"
      CLUSTER_ID = "{{workflow.parameters.cluster_id}}"
      # Scratch directory used only to probe whether a cluster repo can be cloned.
      CLONE_DIR = './tempcluster'

      def getToken() :
          """Log in to the TKS API and return the bearer token ('' on failure)."""
          data = {
              'organizationId' : os.environ['ORGANIZATION_ID'],
              'accountId': os.environ['ACCOUNT_ID'],
              'password' : os.environ['PASSWORD']
          }

          res = requests.post(TKS_API_URL+"/api/1.0/auth/login", json = data )
          if res.status_code != 200 :
              return ''
          resJson = res.json()
          return resJson['user']['token']

      def maskGitToken(text) :
          """Return text with the git service token replaced, so it is safe to log."""
          gitToken = os.environ['TOKEN']
          text = str(text)
          if gitToken :
              text = text.replace(gitToken, '***')
          return text

      output_cluster_list = []
      inwards_endpoint_list = []
      inwards_endpoint_map = {}

      TOKEN=getToken()
      if not TOKEN :
          # Fail fast instead of sending requests with an empty bearer token.
          sys.exit('Failed to get token')
      authHeaders = {"Authorization": "Bearer " + TOKEN}

      res = requests.get(TKS_API_URL+"/api/1.0/clusters/" + CLUSTER_ID,
                         headers=authHeaders )
      if res.status_code != 200 :
          sys.exit('Failed to get cluster')

      cluster = res.json()['cluster']
      print( cluster )
      organizationId = cluster['organizationId']
      cur_cluster_name = cluster['id']

      res = requests.get(TKS_API_URL+"/api/1.0/clusters?organizationId=" + organizationId,
                         headers=authHeaders )
      if res.status_code != 200 :
          sys.exit('Failed to get clusters')

      clusters = res.json()['clusters']

      # Split GIT_SVC_URL into scheme and host part once; keep the scheme that
      # was configured (http when none is given) instead of forcing http.
      gitSvcUrl = os.environ['GIT_SVC_URL']
      gitScheme, sep, gitBaseUrl = gitSvcUrl.partition("://")
      if not sep :
          gitScheme, gitBaseUrl = "http", gitSvcUrl

      print("Iterating over clusters in the same contract...")

      # Iterate over cluster list except current cluster #
      for cluster in clusters:
          if cluster['status'] != "RUNNING":
              continue
          if cluster['id'] == CLUSTER_ID :
              continue

          print("*******************************************")
          print("Checking cluster: {}".format(cluster['id']))
          print( gitBaseUrl )

          print("Checking if corresponding cluster repo exists..")
          url = "@" + gitBaseUrl + "/" + os.environ['USERNAME'] + "/" + cluster['id']
          print( url )

          repoUrl = gitScheme + "://" + os.environ['TOKEN'] + url
          try:
              # The clone is only an existence check; its content is not used.
              git.Repo.clone_from(repoUrl, CLONE_DIR)
          except git.exc.GitCommandError as e:
              print(maskGitToken(e))
              print("Repo {} doesn't exist. Skipping this cluster..".format(maskGitToken(repoUrl)))
              continue
          finally:
              # Always drop the scratch clone, so that a later failure in this
              # iteration cannot make the next cluster's clone fail.
              shutil.rmtree(CLONE_DIR, ignore_errors=True)

          res = requests.get(TKS_API_URL+"/api/1.0/app-groups?clusterId=" + cluster['id'],
                             headers=authHeaders )
          if res.status_code != 200 :
              print( 'Failed to get appgroups for cluster ')
              continue

          appGroups = res.json()['appGroups']
          print( appGroups )

          # Check if LMA group exists.
          for appGroup in appGroups:
              if appGroup['appGroupType'] != "LMA" :
                  continue
              print("Found LMA appGroup: {}".format(appGroup['name']))

              res = requests.get(TKS_API_URL+"/api/1.0/app-groups/" + appGroup['id'] + "/applications?applicationType=PROMETHEUS",
                                 headers=authHeaders )
              if res.status_code != 200 :
                  print( 'Failed to get applications for appgroup')
                  continue

              applications = res.json()['applications']
              if applications :
                  # This is based on the premise that there's only one prometheus per appGroup.
                  endpoint = applications[0]['endpoint']
                  print("Get Thanos-sc endpoint: {}. Appending it to inward list.".format(endpoint))

                  # Add this cluster's endpoint to endpoint map
                  inwards_endpoint_list.append(endpoint)

                  # Add this cluster to outward list so that current ep is updated to this cluster
                  output_cluster_list.append(json.dumps({"name": cluster['id']}))

      # Compose proper format to be used as input on next step
      inwards_endpoint_map['querier.stores'] = inwards_endpoint_list

      ###########################
      # Construct output params #
      ###########################
      print("*** Outwards Cluster List ***")
      for idx, item in enumerate(output_cluster_list, start=1):
          print("item {}: {}".format(idx, item))

      # Items are already JSON strings; join them without extra whitespace.
      with open("/mnt/out/cluster_list.txt", "w") as f:
          f.write('[' + ','.join(output_cluster_list) + ']')

      # Written as a Python repr (single-quoted), not as JSON.
      with open("/mnt/out/inwards_endpoint.txt", "w") as f:
          f.write(repr(inwards_endpoint_map))

      with open("/mnt/out/cur_cluster_name.txt", "w") as f:
          f.write(cur_cluster_name)

- name: create-keycloak-client
activeDeadlineSeconds: 600
inputs:
Expand Down
159 changes: 130 additions & 29 deletions deploy_apps/tks-primary-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,6 @@ spec:
]
when: "{{workflow.parameters.object_store}} == s3"

# TODO: 전체 완성을 위해서는 아래 내역을 구현하여 동적인 bucket을 만드는 방식으로 구현해야 하지만
# 5월 오픈 전 가능한 형상을 위해 협의한 바(아래)에 따라 본 부분은 기존에 준비된 것을 사용하는 것으로 구현하고 추후 수정한다.
# 1. 사용자가 생성하는 첫 번째 클러스터는 primary cluster
# 2. primary cluster는 계약이 종료되기 전까지 임의 삭제 불가
# 3. 개별 클러스터에서 수행되는 모니터링은 없고 계약 단위에서 수행되어야 함
# 하지만 이 부분에 datasource를 바꿔주는 부분이 포함되어 있으므로 일단 한 번은 타야 할 듯...
- - name: change-target
template: change-logging-target
arguments:
Expand Down Expand Up @@ -216,6 +210,28 @@ spec:
- name: primary_cluster
- name: member_clusters
steps:

# Step: run the sub-change-thanos-sidecar template, forwarding this template's
# primary_cluster and member_clusters inputs unchanged.
- - name: change-thanos-sidecar
template: sub-change-thanos-sidecar
arguments:
parameters:
- name: primary_cluster
value: '{{inputs.parameters.primary_cluster}}'
- name: member_clusters
value: '{{inputs.parameters.member_clusters}}'

# Step: call the "main" template of event-gitea-render-manifests for the
# current cluster's site repo (<github_account>/<cluster_id>) on the configured
# base repo branch. Guarded by the "changed" output of the
# change-thanos-sidecar step.
- - name: render-current-cluster
templateRef:
name: event-gitea-render-manifests
template: main
arguments:
parameters:
- name: decapod_site_repo
value: "{{ workflow.parameters.github_account }}/{{ workflow.parameters.cluster_id }}"
- name: base_repo_branch
value: "{{ workflow.parameters.base_repo_branch }}"
when: "{{steps.change-thanos-sidecar.outputs.parameters.changed}} != 'NO_CHANGE_HERE'" # Rendering is unnecessary if the change was already applied to this cluster (i.e. this step ran once before).

- - name: sync-organization-changes
template: sub-sync-organization-changes
arguments:
Expand All @@ -232,10 +248,10 @@ spec:
arguments:
parameters:
- name: decapod_site_repo
value: "{{ workflow.parameters.github_account }}/{{steps.sync-organization-changes.outputs.parameters.primary_cluster}}"
value: "{{ workflow.parameters.github_account }}/{{steps.sync-organization-changes.outputs.parameters.changed}}"
- name: base_repo_branch
value: "{{ workflow.parameters.base_repo_branch }}"
when: "{{steps.sync-organization-changes.outputs.parameters.primary_cluster}} != 'NO_CHANGE_HERE'"
when: "{{steps.sync-organization-changes.outputs.parameters.changed}} != 'NO_CHANGE_HERE'"

#######################
# Template Definition #
Expand All @@ -258,22 +274,9 @@ spec:
retryStrategy:
limit: 2

# - name: sub-prepare-bucket
# inputs:
# parameters:
# - name: primary_cluster
# container:
# name: prepare-bucket
# image: harbor.taco-cat.xyz/tks/hyperkube:v1.18.6
# command:
# - /bin/bash
# - '-c'
# - |
# echo "prepare bucket for the '{{workflow.parameters.organization_id}}' (clusters: '{{inputs.parameters.primary_cluster}}')"
# activeDeadlineSeconds: 900
# retryStrategy:
# limit: 2

# function sub-pre-change-logging-target
# 1. Change endpoint of fluentbit-output (all in org.)
# 2. Change endpoint of thanos-sidecar in prometheus-pod (all in org.)
- name: sub-pre-change-logging-target
inputs:
parameters:
Expand Down Expand Up @@ -546,6 +549,104 @@ spec:
path: /mnt/out/modified_cluster_list.txt
activeDeadlineSeconds: 900

# Template: sub-change-thanos-sidecar
# Clones the current cluster's site repository, rewrites the objectStorage
# override of the "thanos-config" chart in <cluster>/lma/site-values.yaml so
# that it points at the S3 bucket "<primary_cluster>-tks-thanos", then commits
# and pushes the result when the file actually changed.
#
# Inputs:
#   primary_cluster - bucket owner; falls back to the current cluster if empty
#   member_clusters - accepted for interface parity; not used by the script
# Output:
#   changed - the current cluster id when a commit was pushed, otherwise the
#             sentinel NO_CHANGE_HERE (callers compare against it in "when")
- name: sub-change-thanos-sidecar
  inputs:
    parameters:
      - name: primary_cluster
      - name: member_clusters
  container:
    name: logging-target-changer
    image: harbor.taco-cat.xyz/tks/shyaml_jq_yq_kubectl_python:3.11
    command:
      - /bin/bash
      - '-c'
      - |
        #!/bin/bash

        set -ex

        function log() {
          level=$1
          msg=$2
          date=$(date '+%F %H:%M:%S')
          echo "[$date] $level $msg"
        }

        current_cluster={{workflow.parameters.cluster_id}}
        primary_cluster={{inputs.parameters.primary_cluster}}
        member_clusters="{{inputs.parameters.member_clusters}}"

        # No primary cluster given: treat the current cluster as the primary.
        if [ -z "${primary_cluster}" ]; then
          primary_cluster=${current_cluster}
        fi

        S3_Service="s3://ap-northeast-2"
        cp /kube/value kubeconfig_adm
        export KUBECONFIG=kubeconfig_adm

        #################
        # updates
        #################
        GIT_ACCOUNT={{workflow.parameters.github_account}}
        # Strip whichever scheme GIT_SVC_URL carries, then re-add the matching
        # one in front of the token (https stays https, anything else is http).
        git_host=${GIT_SVC_URL#*://}
        if [[ $GIT_SVC_URL == https://* ]]; then
          git_scheme=https
        else
          git_scheme=http
        fi
        repository_base=${git_scheme}://${TOKEN//[$'\t\r\n ']}@${git_host}/${GIT_ACCOUNT}/

        log "INFO" "##### change the thanos object storage of ${current_cluster} to $S3_Service (bucket: ${primary_cluster}-tks-thanos)"
        [ -d ${current_cluster} ] || git clone ${repository_base}${current_cluster}
        cd ${current_cluster}

        # Path is relative to the repo root: the repo holds a directory named
        # after the cluster.
        site_values=${current_cluster}/lma/site-values.yaml

        # Drop any previous objectStorage override, then rebuild it for S3.
        yq -i e "del(.charts[] | select(.name == \"thanos-config\").override.objectStorage)" ${site_values}
        yq -i e ".charts |= map(select(.name == \"thanos-config\").override.objectStorage.type=\"s3\")" ${site_values}
        yq -i e ".charts |= map(select(.name == \"thanos-config\").override.objectStorage.rawConfig.endpoint=\"s3.ap-northeast-2.amazonaws.com\")" ${site_values}
        yq -i e ".charts |= map(select(.name == \"thanos-config\").override.objectStorage.rawConfig.region=\"ap-northeast-2\")" ${site_values}
        yq -i e ".charts |= map(select(.name == \"thanos-config\").override.objectStorage.rawConfig.bucket=\"${primary_cluster}-tks-thanos\")" ${site_values}
        yq -i e ".charts |= map(select(.name == \"thanos-config\").override.objectStorage.rawConfig.signature_version2=false)" ${site_values}

        git config --global user.name "tks"
        git config --global user.email "[email protected]"

        # The output file is written exactly once, in one of the two branches,
        # so the NO_CHANGE_HERE sentinel is never overwritten afterwards.
        if [[ $(git status --porcelain) ]]; then
          log "INFO" "##### commit changes on ${current_cluster} to use s3"
          cmessage="changes on ${current_cluster} to use s3"
          git add ${site_values}
          git commit -m "change loki and thanos endpoints. (by set-primary workflow)" -m "$cmessage"
          git push
          echo ${current_cluster} > /mnt/out/modified_cluster_list.txt
        else
          log "INFO" "No change on the cluster ${current_cluster}"
          echo NO_CHANGE_HERE > /mnt/out/modified_cluster_list.txt
        fi
        cd -
        rm -rf ${current_cluster}

    env:
      # NOTE(review): the variable name is spelled "SOTRE"; kept unchanged
      # because other templates read it under this exact name.
      - name: OBJECT_SOTRE
        value: "{{workflow.parameters.object_store}}"
    envFrom:
      - secretRef:
          name: "git-svc-token"
    volumeMounts:
      - name: kubeconfig-adm
        mountPath: "/kube"
      - name: out
        mountPath: /mnt/out
  volumes:
    - name: out
      emptyDir: {}
  outputs:
    parameters:
      - name: changed
        valueFrom:
          path: /mnt/out/modified_cluster_list.txt
  activeDeadlineSeconds: 900

- name: sub-sync-organization-changes
inputs:
parameters:
Expand Down Expand Up @@ -658,10 +759,10 @@ spec:
git add ${primary_cluster}/lma/site-values.yaml
git commit -m "change thanos-query stores. (by set-primary workflow)" -m "$cmessage"
git push
echo ${primary_cluster} > /mnt/out/primary_cluster.txt
echo ${primary_cluster} > /mnt/out/changed.txt
else
log "INFO" "No change on the cluster ${member}"
echo NO_CHANGE_HERE > /mnt/out/primary_cluster.txt
echo NO_CHANGE_HERE > /mnt/out/changed.txt
fi

if [ "$OBJECT_SOTRE" != "s3" ]; then
Expand All @@ -673,7 +774,7 @@ spec:
git add ${primary_cluster}/lma/site-values.yaml
git commit -m "change iamRoles(s3). (by set-primary workflow)" -m "$cmessage"
git push
echo ${primary_cluster} > /mnt/out/primary_cluster.txt
echo ${primary_cluster} > /mnt/out/changed.txt
else
log "INFO" "(iamRoles) No change on the cluster ${member}"
fi
Expand Down Expand Up @@ -702,9 +803,9 @@ spec:
emptyDir: {}
outputs:
parameters:
- name: primary_cluster
- name: changed
valueFrom:
path: /mnt/out/primary_cluster.txt
path: /mnt/out/changed.txt
activeDeadlineSeconds: 900


Expand Down