-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_liquibase_data.py
executable file
·267 lines (217 loc) · 11.6 KB
/
generate_liquibase_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import argparse
import json
import os
import random
import string
import sys
import time
from datetime import date, datetime, timedelta, timezone
from math import log, exp

import ndjson
import numpy
import yaml
from dateutil.relativedelta import relativedelta
#----------------------------------------------------------------------------------------------------
# GLOBAL VARIABLES
#----------------------------------------------------------------------------------------------------
########################################################
# NEED TO IMPLEMENT
# • Rollbacks trend down over time
# • Fix multiple entries per changesetID
########################################################
# Directory containing the sample Liquibase MDC log files used as templates.
PATH_TO_MDCLOGFILES = "./mdclogfiles/"

# Number of cycles to simulate; 1 cycle = 1 hour (1680 hours = 70 days).
CYCLES = 1680
# Baseline number of deployments per cycle; the main loop increases it over time.
deployments_per_cycle = 1
# Per-cycle probability that deployments_per_cycle is increased by one.
DEPLOYMENT_TREND = 0.002
# Relative spread applied to the per-cycle deployment count so the data looks
# less uniform.
variance = 1

# Host part spliced between the database type and the port when building
# realistic-looking target URLs.
URL = "://rdbms.us-west-2.amazonaws.com:"
DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'

# Command name -> selection weight. Key order matters: it is also the key order
# of ALLCOMMANDS and the order in which the weighted lists are unpacked below.
_COMMAND_WEIGHTS = {
    "rollbacktargeted": 5,
    "rollbackcount": 3,
    "rollback": 1,
    "updatetotag": 15,
    "updatecount": 3,
    "update-one": 1,
    "update": 45,
    "changelogsync": 8,
}
# Command name -> raw template text; starts empty and is filled in by
# get_liquibase_commands().
ALLCOMMANDS = dict.fromkeys(_COMMAND_WEIGHTS, "")
# One list per command, repeating its name `weight` times, so that
# random.choice() over the concatenation performs a weighted pick.
RBT, RBC, RB, UDT, UDC, UDO, UD, CLS = (
    [name] * weight for name, weight in _COMMAND_WEIGHTS.items()
)
COMMAND_DISTRIBUTION = [*RBT, *RBC, *RB, *UDT, *UDC, *UDO, *UD, *CLS]

# Default probability that a command results in a failure (vs. success).
FAILURE_RATE = 0.005

# Weighted database types, built the same way as the command lists.
MONGODB = ["mongodb"] * 10
ORACLE = ["oracle"] * 7
MYSQL = ["mysql"] * 5
SNOWFLAKE = ["snowflake"] * 2
# NOTE(review): MYSQL is defined above but is not part of the distribution --
# confirm whether leaving it out is intentional.
DATABASE_DISTRIBUTION = [*MONGODB, *ORACLE, *SNOWFLAKE]
# Port used in the generated target URL for each database type.
DATABASE_PORTS = {
    "mongodb": "27017",
    "oracle": "1521",
    "mysql": "3306",
    "mariadb": "3306",
    "postgresql": "5432",
    "db2z": "25000",
    "neo4j": "7474",
    "snowflake": "443",
}
# Number of database instances each deployment targets, per database type
# (currently 2 for every type).
DATABASE_NUMBERS = dict.fromkeys(DATABASE_PORTS, 2)
DATABASE_NAMES = ["db1"]

# Every application name that may appear in the generated data.
APP_NAMES_LIST = [
    "Commercial-CMS",
    "Commercial-OMS",
    "Commercial-Analytics",
    "Commercial-RMI",
    "Commercial-BI",
    "Commercial-Onyx",
    "Commercial-CyclopsBI",
    "Commercial-GambitCard",
    "Commercial-GauntletComm",
    "Commercial-StarkWealth",
    "Commercial-Students",
    "Commercial-Billing",
    "Global-Risk",
    "Commercial-UMS",
    "Global-UWS",
    "Global-WWW",
    "Global-OmniAuth",
    "Global-OAS",
    "Global-MYRP",
    "Global-ASDE",
    "Global-WIBAC",
    "Global-WST",
    "Global-FHCT",
    "Global-MAA",
    "Global-ANG",
    "Global-BillPay",
    "Wealth-DBRDC",
    "Wealth-SMS",
    "Wealth-OFX",
    "Wealth-TNG",
    "Wealth-DBRDC/WIBT",
    "Wealth-OSMP",
    "Consumer-CAES",
    "Consumer-AOESN",
    "Consumer-AOESP",
    "Consumer-MTP",
    "Risk-OPS",
    "Risk-OPSPD",
    "Risk-1BAAS",
    "Risk-GUSER",
    "Management-PSGCD",
    "Management-ISGSM",
]
# Weighted pool of application names; starts empty and is populated right after
# this section.
APP_NAMES = []
# Give every application a random weight from 0 to 19 by repeating its name that
# many times; a weight of 0 leaves the application out of this run entirely.
for app in APP_NAMES_LIST:
    APP_NAMES.extend([app] * random.randrange(20))
# Changeset authors as (full name, short id, selection weight).
_AUTHOR_WEIGHTS = (
    ("arthur.titian.mcfly", "atm", 3),
    ("hammurabi.walganus.quijote", "hwq", 19),
    ("guinevere.idril.poirot", "gip", 4),
    ("zara.valentinian.finch", "zvf", 5),
    ("xerxes.olivette.nickleby", "xon", 12),
    ("aida.lionel.twist", "alt", 9),
    ("galadriel.merry.skywalker", "gms", 13),
    ("sauron.justinian.maximus", "sjm", 7),
    ("caspian.malvolio.clovis", "cmc", 6),
    ("kratos.hyacintha.scrooge", "khs", 2),
    ("sherlock.dulcinea.gump", "sdg", 8),
)
# One list per author holding `weight` copies of [full name, short id], so that
# random.choice() over the concatenation performs a weighted pick.
AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK = (
    [[full_name, short_id]] * weight
    for full_name, short_id, weight in _AUTHOR_WEIGHTS
)
AUTHOR_DISTRIBUTION = [*AA, *BB, *CC, *DD, *EE, *FF, *GG, *HH, *II, *JJ, *KK]

# Weighted environments: dev is drawn most often, prod least often.
DDEV_ENV = ["dev"] * 4
TEST_ENV = ["test"] * 3
STAG_ENV = ["stag"] * 2
PROD_ENV = ["prod"] * 1
ENVIRONMENTS = [*DDEV_ENV, *TEST_ENV, *STAG_ENV, *PROD_ENV]

# File extensions a generated changelog file may carry.
FILETYPES = ['.yml', '.sql', '.json', '.xml']

# Starting number for deployment IDs; incremented before each deployment.
deployment_id = 9429453980
changeset_id_tracker = {}

# Per environment: changelog filename -> application name for changelogs that
# are waiting to be deployed there. Keeping the pairing stable across
# environments is important for calculating Lead Time. Seeded with one pending
# changelog per environment.
environment_tracker = {
    "dev": {"zizurofqcz": "Commercial-CMS"},
    "test": {"hreruneqnz": "Commercial-OMS"},
    "stag": {"fpqaulqprt": "Commercial-Analytics"},
    "prod": {"crnjndgtkz": "Commercial-RMI"},
}
# Environment a changelog moves to after being deployed; prod is the last stop.
environment_incrementer = dict(dev="test", test="stag", stag="prod")
# Probability that a deployment drawn for an environment picks up a pending
# changelog instead of starting a brand-new one in dev.
environment_probability = dict(dev=0.5, test=0.75, stag=0.9, prod=1)
#----------------------------------------------------------------------------------------------------
# FUNCTION DEFINITIONS
#----------------------------------------------------------------------------------------------------
def get_liquibase_commands(command, filename):
    """Append the contents of one template log file to ALLCOMMANDS[command].

    command  -- key of ALLCOMMANDS that the file's text is appended to.
    filename -- name of a file inside PATH_TO_MDCLOGFILES.
    """
    template_path = os.path.join(PATH_TO_MDCLOGFILES, filename)
    with open(template_path, 'r') as template_file:
        template_text = template_file.read()
    # An empty file contributes nothing, so ALLCOMMANDS is left untouched.
    if template_text:
        ALLCOMMANDS[command] += template_text
def get_date(hour, minute):
    """Return the formatted timestamp for a point in the simulated timeline.

    The timestamp is the current time moved back by (CYCLES - hour) hours and
    then by a further `minute` minutes, so the final cycle lands at "now" and
    earlier cycles reach back into the past.

    hour   -- cycle number (1 cycle = 1 hour).
    minute -- additional minutes to subtract.

    Returns the timestamp rendered with DATE_FORMAT.
    """
    # DATE_FORMAT ends in a literal "Z" (UTC), so the clock reading must be
    # taken in UTC; a naive local time would be mislabelled on any host whose
    # timezone is not UTC.
    now_utc = datetime.now(timezone.utc)
    offset = timedelta(hours=CYCLES - hour) + timedelta(minutes=minute)
    return (now_utc - offset).strftime(DATE_FORMAT)
def decision(probability: float) -> bool:
    """Return True with the given probability, otherwise False.

    A single uniform draw from [0.0, 1.0) is compared with `probability`, so
    values >= 1 always yield True and values <= 0 always yield False.
    """
    roll = random.random()
    return roll < probability
def randomword(length: int) -> str:
    """Return a random string of `length` lowercase ASCII letters.

    A length of zero (or less) gives the empty string.
    """
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)
#----------------------------------------------------------------------------------------------------
# READ IN COMMANDS FROM FILE
#----------------------------------------------------------------------------------------------------
# Command names in matching priority. A name that is a substring of another
# ("rollback" in "rollbackcount", "update" in "updatetotag") must come after the
# longer one, otherwise the longer commands' files would be filed under the
# shorter command.
_COMMAND_MATCH_ORDER = (
    "rollbacktargeted",
    "rollbackcount",
    "rollback",
    "updatetotag",
    "updatecount",
    "update-one",
    "update",
    "changelogsync",
)

# Load every template log file into ALLCOMMANDS, keyed by the first command
# name found in its filename.
for filename in os.listdir(PATH_TO_MDCLOGFILES):
    matched_command = next(
        (name for name in _COMMAND_MATCH_ORDER if name in filename), None
    )
    if matched_command is not None:
        get_liquibase_commands(matched_command, filename)
    elif ".DS_Store" not in filename:
        # macOS .DS_Store metadata files are skipped silently; anything else is
        # reported. The report goes to stderr because stdout carries the
        # generated NDJSON and must not contain anything but JSON lines.
        print("!!!!!!!!!ERROR!!!!!!!!!" + filename, file=sys.stderr)
#----------------------------------------------------------------------------------------------------
# MAIN LOGIC
#----------------------------------------------------------------------------------------------------
# 1 cycle = 1 hour
# Probability that a command fails, per target environment. Environments not
# listed here fall back to the module-level FAILURE_RATE.
_FAILURE_RATE_BY_ENVIRONMENT = {
    "prod": 0.0005,
    "stag": 0.0025,
    "test": 0.05,
    "dev": 0.25,
}

# Generate the data: for every cycle (hour), a varying number of deployments,
# each run against every database instance of one randomly chosen type, and
# each emitting one NDJSON line per template command on stdout.
for cycle in range(1, CYCLES + 1):
    # Trend: with probability DEPLOYMENT_TREND the baseline number of
    # deployments per cycle grows by one.
    if random.random() < DEPLOYMENT_TREND:
        deployments_per_cycle += 1

    # As the baseline grows, shrink the relative variance so the number of
    # generated documents does not get out of hand.
    # NOTE(review): recomputed on every cycle here; confirm it was not meant to
    # be updated only when the baseline grows.
    variance = (1 / (deployments_per_cycle + 1)) + 0.3

    # Number of deployments for this cycle: the baseline plus random jitter.
    actual_number_of_deployments = deployments_per_cycle + round(
        random.uniform(-variance, variance) * deployments_per_cycle
    )

    for _ in range(actual_number_of_deployments):
        deployment_id += 1
        author = random.choice(AUTHOR_DISTRIBUTION)[1]
        command_choice = random.choice(COMMAND_DISTRIBUTION)
        # NOTE(review): the parsed templates are reused (and mutated in place)
        # for every database of this deployment, so a "failure" outcome set for
        # one database is still in place for the following ones. Confirm
        # whether that stickiness is intended.
        commands = ndjson.loads(ALLCOMMANDS[command_choice])
        database = random.choice(DATABASE_DISTRIBUTION)
        database_port = DATABASE_PORTS[database]
        environment = random.choice(ENVIRONMENTS)
        app_name = random.choice(APP_NAMES)

        # Make changelogs work their way across the environments: either pick
        # up a changelog that is pending in the drawn environment and queue it
        # for the next one, or start a brand-new changelog in dev.
        pending = environment_tracker[environment]
        if pending and decision(environment_probability[environment]):
            changelog_filename, app_name = pending.popitem()
            next_environment = environment_incrementer.get(environment)
            if next_environment is not None:
                environment_tracker[next_environment][changelog_filename] = app_name
        else:
            environment = "dev"
            changelog_filename = randomword(10)
            environment_tracker[environment][changelog_filename] = app_name

        # Looked up from the environment itself instead of substring-matching
        # the endpoint URL and overwriting the FAILURE_RATE constant.
        failure_rate = _FAILURE_RATE_BY_ENVIRONMENT.get(environment, FAILURE_RATE)

        # Run the deployment against every database instance of this type.
        for count in range(1, DATABASE_NUMBERS[database] + 1):
            endpoint = (
                "jdbc:" + database + URL + database_port
                + "/" + app_name + ":" + environment
            )
            # Values that are the same for every command of this database run;
            # each is applied only if the template already has that field.
            overrides = {
                "commandContextFilter": environment,
                "commandLabelFilter": app_name,
                "changesetId": changelog_filename + str(count),
                "liquibaseTargetUrl": endpoint,
                "changesetAuthor": author,
                "liquibaseSystemName": database + "-db" + str(count) + "-docker-20.10.23",
                "deploymentId": deployment_id,
            }

            for command in commands:
                command["timestamp"] = get_date(cycle, count)
                for field, value in overrides.items():
                    if field in command:
                        command[field] = value

                if "liquibaseSystemUser" in command:
                    # Almost always the service account; occasionally a person.
                    if decision(0.002):
                        command["liquibaseSystemUser"] = random.choice(AUTHOR_DISTRIBUTION)[0]
                    else:
                        command["liquibaseSystemUser"] = "liquibase_admin"

                if "changesetOperationStart" in command:
                    start_text = get_date(cycle, count)
                    command["changesetOperationStart"] = start_text
                    if "changesetOperationStop" in command:
                        started = datetime.strptime(start_text, DATE_FORMAT)
                        # A "failed" operation runs up to 99 minutes longer
                        # than the usual sub-100-second operation.
                        slow = decision(failure_rate)
                        elapsed = timedelta(
                            minutes=random.randrange(100) if slow else 0,
                            seconds=random.randrange(100),
                            microseconds=random.randrange(1000000),
                        )
                        command["changesetOperationStop"] = (started + elapsed).strftime(DATE_FORMAT)
                        # Whole milliseconds between start and stop.
                        command["changesetOperationDuration"] = elapsed // timedelta(milliseconds=1)

                if "deploymentOutcome" in command and decision(failure_rate):
                    command["deploymentOutcome"] = "failure"

                # Emit the finished command as one NDJSON line.
                print(json.dumps(command))