-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.py
458 lines (381 loc) · 10.8 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
import sqlparse
import os
import csv
from itertools import product
import sys
table_info = {} #table name : table cols
table_names = [] #all tables in db
query_tables = [] #all the tables mentioned in query
dic = {} #cols : indices in cartesian product
aggregates = {} # col : aggregate func
#helper function for where conditions
def evaluate(c1,op,c2) :
if(op == '=') :
return (c1 == c2)
if(op == '>') :
return (c1 > c2)
if(op == '<') :
return (c1 < c2)
if(op == '>=') :
return (c1 >= c2)
if(op == '<=') :
return (c1 <= c2)
#Store metadata in a dictionary from metadata.txt file -
def getMetaData() :
flag = 0
table_name = ''
try :
meta_file = open('metadata.txt','r')
for i in meta_file :
data = i.strip()
if(data == '<begin_table>') :
flag = 1
continue
elif (data == '<end_table>') :
flag = 0
continue
elif (flag == 1) :
table_name = data
table_info[data] = []
table_names.append(data)
flag = 2
elif (flag == 2) :
table_info[table_name].append(data)
except IOError as error :
print(error)
#Process the given query
def processQuery(query) :
cartesian_product = []
query_cols = [] # columns present in select
query_table_cols = [] #all columns of the tables mentioned in query
token_dic = {}
check_query = query.lower()
where = False
distinct = False
group_by = False
order_by = False
aggregate_func = False
#Parse the query for further processing
query = sqlparse.parse(query)[0]
tokens = query.tokens
x = 0
for i in tokens :
temp = str(i).lower()
if('from' in temp) :
token_dic['from'] = x
elif('where' in temp) :
where = True
token_dic['where'] = x
elif('group by' in temp) :
group_by = True
token_dic['groupby'] = x
elif('order by' in temp) :
order_by = True
token_dic['orderby'] = x
x += 1
######### ERROR HANDLING : ############
#check if SEMICOLON is present in query
if(check_query[-1] != ';') :
print("Invalid query!Please add a semicolon at the end")
sys.exit(0)
#check if SELECT is written in query
if('select' not in check_query) :
print("Incorrect Syntax!No SELECT statement")
sys.exit(0)
#check if FROM is written in query
if('from' not in check_query):
print("Incorrect Syntax!No FROM statement")
sys.exit(0)
#check if DISTINCT is written in query
if('distinct' in check_query):
distinct = True
#check if any AGGREGATE FUNCTION is involved
if(('max' in check_query or 'min' in check_query or 'average' in check_query or 'count' in check_query or 'sum' in check_query) and ('group by' not in check_query)):
aggregate_func = True
#store table names
for k in str(tokens[token_dic['from'] + 2]).split(',') :
query_tables.append(k.strip())
#ERROR HANDLONG for tables - check if all query_tables are present -
for table in query_tables :
if table not in table_names :
print(table + " not found")
sys.exit(0)
#store the cols from the tables :
for table in query_tables :
temp = table_info[table]
for i in temp :
query_table_cols.append(i)
#take cross product of tables
crossProduct(query_tables,cartesian_product,dic)
#store col names
col = tokens[token_dic['from'] - 2]
for k in (str(col).split(',')):
p = str(k).lower()
#check for aggregate functions -
if('count' in p or 'sum' in p or 'min' in p or 'max' in p or 'average' in p) :
s = (str(k).split('('))
t = s[-1].split(')')[0]
query_cols.append(t.strip())
aggregates[t.strip()] = s[0]
else :
query_cols.append(str(k).strip())
#ERROR Handling for columns
if(len(query_cols) == 0) :
print("No column name mentioned")
sys.exit(0)
#Check if '*' is present
if(len(query_cols) == 1) :
#add all the query_cols to query_cols list
if(query_cols[0] == '*') :
query_cols = query_table_cols
#check if given query_cols are actually present in the given tables :
for i in query_cols :
if(i not in query_table_cols) :
print("Column " + i + " doesn't exist in the given tables")
sys.exit(0)
########### RROCESS WHERE ######### -
if(where) :
w = token_dic['where']
where_str = str(tokens[w])
#Process FIRST CONDITION in where
cond1part1 = str(tokens[w][2][0])
op1 = str(tokens[w][2][2])
cond1part2 = str(tokens[w][2][4])
cond_index_list1 = []
num_check1 = False
num1 = 0
if(str(cond1part1) in query_table_cols) :
cond_index_list1.append((dic[cond1part1]))
else :
print("Column name in WHERE doesn't exist")
if(str(cond1part2) in query_table_cols):
cond_index_list1.append(((dic[cond1part2])))
else :
num_check1 = True
num1 = int((cond1part2))
#Process SECOND CONDITION in where
if('AND' in where_str or 'OR' in where_str) :
cond2part1 = str(tokens[w][6][0])
op2 = str(tokens[w][6][2])
cond2part2 = str(tokens[w][6][4])
cond_index_list2 = []
num_check2 = False
num2 = 0
if(str(cond2part1) in query_table_cols) :
cond_index_list2.append((dic[cond2part1]))
else :
print("Column name in WHERE doesn't exist")
if(str(cond2part2) in query_table_cols):
cond_index_list2.append(((dic[cond2part2])))
else :
num_check2 = True
num2 = int((cond2part2))
#EVALUATE CONDITIONS :
#store temporary result
temp_res_list = []
#if the first condition is based on a constant value
for row in cartesian_product :
cond1_res = False
cond2_res = False
if(num_check1) :
if((evaluate(row[cond_index_list1[0]],(op1),num1))) :
cond1_res = True
else :
if((evaluate(row[cond_index_list1[0]],(op1),row[cond_index_list1[1]]))) :
cond1_res = True
if('AND' in where_str and cond1_res) :
if(num_check2) :
if((evaluate(row[cond_index_list2[0]],(op2),num2))) :
cond2_res = True
else :
if((evaluate(row[cond_index_list2[0]],(op2),row[cond_index_list2[1]]))) :
cond2_res = True
if(cond2_res) :
temp_res_list.append(row)
if('OR' in where_str and cond1_res) :
temp_res_list.append(row)
elif('OR' in where_str and cond2_res) :
temp_res_list.append(row)
if('AND' not in where_str and 'OR' not in where_str and cond1_res):
temp_res_list.append(row)
cartesian_product = temp_res_list
###### PROCESS GROUP BY : ########
if(group_by) :
col_index = ''
col = str(tokens[token_dic['groupby'] + 2])
try:
col_index = dic[str(tokens[token_dic['groupby'] + 2])]
except KeyError :
print("Column in group by clause doesn't exist ")
sys.exit(0)
#CHECK if col in group by is present in select :
if(col not in query_cols) :
print(col + " must also be present in Select")
sys.exit(0)
temp_res = []
ans_dict = {}
count = 1
#for every column in select calculate the aggregate fn value
#store that value in a dict (ans_dict)
for key in aggregates :
index = dic[key]
temp_dic = {}
aggr = aggregates[key]
#store key(group by col) : other column values in a dictionary
#to implement group by
for i in cartesian_product :
if(i[col_index] in temp_dic) :
temp_dic[i[col_index]].append(i[index])
else :
temp_dic[i[col_index]] = []
temp_dic[i[col_index]].append(i[index])
#evaluate the aggregate function value :
for i in temp_dic :
val = 0
if(aggr == 'min') :
val = min(temp_dic[i])
elif(aggr == 'max') :
val = max(temp_dic[i])
elif(aggr == 'count') :
val = len(temp_dic[i]) + 1
elif(aggr == 'sum') :
val = sum(temp_dic[i])
elif(aggr == 'average') :
val = sum(temp_dic[i]) / len(temp_dic[i])
if(i not in ans_dict) :
ans_dict[i] = []
ans_dict[i].append(val)
dic[key] = count
count += 1
dic[query_cols[0]] = 0
#store res in a list from ans_dict
for i in ans_dict :
temp = []
temp.append(i)
for j in ans_dict[i] :
temp.append(j)
temp_res.append(temp)
cartesian_product = temp_res
######## AGGREGATE FUNCTION ON A SINGLE COLUMN #######
if(aggregate_func) :
temp_list = []
col_index = dic[query_cols[0]]
val = ''
for i in cartesian_product :
temp_list.append(i[col_index])
if('min' in check_query):
val = min(temp_list)
elif('max' in check_query) :
val = max(temp_list)
elif('sum' in check_query):
val = sum(temp_list)
elif('average' in check_query) :
val = sum(temp_list) / (len(temp_list) +1)
elif('count' in check_query) :
val = len(temp_list) + 1
print(query_cols[0])
print(val)
sys.exit(0)
######## PROCESS SELECT COLUMNS : ########
index_list = []
res_list = []
#store the indices of columns need to be printed
for i in query_cols :
index_list.append(dic[i])
#store those columns in temporary list :
for i in cartesian_product :
n = len(i)
temp_list = []
for j in range(0,n) :
if(j in index_list) :
temp_list.append(str(i[j]))
res_list.append(temp_list)
###### PROCESS DISTINCT : #######
if(distinct) :
temp_dic = {}
temp_res = []
for i in res_list :
k = tuple(i)
if(k in temp_dic) :
continue
else :
temp_res.append(i)
temp_dic[k] = i
res_list = temp_res
##### PROCESS ORDER BY : #####
if(order_by) :
col = ''
col_index = ''
temp = (tokens[token_dic['orderby'] + 2])
col = str(temp[0])
order = str(temp[-1])
try:
col_index = dic[col]
except KeyError :
print("Column in order by need to be present in group by")
sys.exit(0)
#sort in ascending order
if(order.lower() == 'asc') :
cartesian_product.sort(key = lambda x: x[col_index])
#sort in descending order
else :
cartesian_product.sort(key = lambda x: x[col_index],reverse = True)
##### PRINT FINAL OUTPUT #######
#print names of columns :
print(','.join(query_cols))
#print comma seperated columns :
for i in res_list :
print(', '.join(i))
#Taking cross product of tables in the query
def crossProduct(query_tables,result_list,dic) :
files = []
finallist = []
temp = 0
for i in query_tables :
tab = i.split('.')
a = table_info[tab[0]]
for j in a :
for key in j :
dic[key] = temp
temp = temp + 1
#storing each file data in a 2D list :
'''for i in query_tables :
fileData = []
try :
with open(i+'.csv','r') as f:
reader = csv.reader(f)
for row in reader:
fileData.append(row)
files.append(fileData)
except IOError as error :
print(error)'''
for i in query_tables :
fileData = []
try :
with open(i+'.csv','r') as f:
reader = csv.reader(f)
for row in reader:
temp_list = row
for i in temp_list :
stripped = i.strip()
if(stripped.startswith('"') and stripped.endswith('"')) :
print("inside if")
temp_list[i] = int(temp_list[i])
fileData.append(temp_list)
files.append(fileData)
except IOError as error :
print(error)
#taking the product
finallist = list(product(*files))
#storing the product in a 2D list
for i in finallist :
temp = []
for j in i :
for k in j :
temp.append(int(k))
result_list.append(temp)
#Main function
if __name__ == '__main__' :
getMetaData()
query = sys.argv[1]
processQuery(query)