"""shorten_diffs.py

Report length statistics for the scraped diffs and write a copy of the data
set without the pull requests flagged for oversized data-file diffs.
"""
from utils import load, dump
import numpy as np
import statistics
import re
# File extensions checked (together with diff length) when flagging diffs.
plock = ('json', 'yaml', 'txt', 'yml')
# Compiled once at import time. A raw string keeps the backslash in ``\.``
# from being read as an (invalid) string escape.
_EXTENSION_PATTERN = re.compile(r'a/.*\.([a-z]*)$')


def get_type(path):
    """Return the file extension of an ``a/``-prefixed diff path.

    The extension is the text after the last dot, and it is only recognised
    when it consists solely of lowercase ASCII letters (it may be empty).

    Args:
        path: Path string expected to start with ``a/``.

    Returns:
        The extension without the dot, or ``''`` when ``path`` is not a
        string, does not start with ``a/``, has no dot, or its suffix
        contains anything other than lowercase letters.
    """
    # Non-string input used to be swallowed by a bare ``except``; keep the
    # same best-effort result without hiding unrelated errors.
    if not isinstance(path, str):
        return ''
    match = _EXTENSION_PATTERN.match(path)
    if match is None:
        return ''
    return match.group(1)
def quantiles(values, q=90):
    """Return the ``q``-th percentile of ``values``.

    The previous implementation split the data into ``100 // (100 - q)``
    groups, which is only correct when ``100 - q`` divides 100; ``q=85``,
    for instance, produced the 83.3rd percentile. Taking cut point ``q`` of
    the 100-quantiles is correct for every whole percentile and gives the
    same result as before for 75, 80, 90 and 95.

    Args:
        values: Iterable of numbers, passed to ``statistics.quantiles``.
        q: Whole-number percentile strictly between 0 and 100.

    Returns:
        The ``q``-th percentile, using the default (exclusive) method of
        ``statistics.quantiles``.

    Raises:
        ValueError: If ``q`` is not a whole number in the range 1..99.
        statistics.StatisticsError: If ``values`` holds too few data points.
    """
    # The range check comes first so NaN/inf are rejected before int().
    if not 0 < q < 100 or q != int(q):
        raise ValueError(f'q must be a whole number between 1 and 99, got {q!r}')
    # n=100 yields 99 cut points; cut point i (1-based) is the i-th percentile.
    percentiles = statistics.quantiles(values, n=100)
    return percentiles[int(q) - 1]
def main():
    """Print diff-length statistics and write the data set minus flagged PRs.

    Loads ``data/big_data/scraped_diffs.pickle``, prints the mean and several
    percentiles of the diff lengths, then flags every pull request that has
    at least one diff longer than the 95th-percentile length whose file
    extension is listed in ``plock``. The entries that are not flagged are
    written to ``data/big_data/scraped_diffs_q95.pickle``.
    """
    # NOTE(review): the file name suggests ``load`` unpickles; only run this
    # on trusted data files.
    data = load('data/big_data/scraped_diffs.pickle')

    # The code below relies on keys being (rid, num) pairs and on each value
    # holding a 'diffs' sequence of strings.
    lengths = [len(diff) for pr in data.values() for diff in pr['diffs']]

    print(f'Mean: {np.mean(lengths)}')
    for q in (75, 80, 85, 90):
        print(f'Q{q}: {quantiles(lengths, q=q)}')
    # Computed once and reused as the threshold below.
    quan = quantiles(lengths, q=95)
    print(f'Q95: {quan}')

    toolong = set()
    for (_, num), pr in data.items():
        for diff in pr['diffs']:
            # First space-separated token of the diff's first line.
            path = diff.split('\n')[0].split(' ')[0]
            exte = get_type(path)
            if len(diff) > quan and exte in plock:
                print(path, exte)
                toolong.add(num)

    # NOTE(review): entries are dropped by ``num`` alone, so every key that
    # shares a flagged number is removed regardless of ``rid`` -- confirm
    # this is intended rather than filtering on the full (rid, num) key.
    newset = {key: value for key, value in data.items() if key[1] not in toolong}

    print(len(data))
    print(len(newset))
    dump(newset, 'data/big_data/scraped_diffs_q95.pickle')


if __name__ == "__main__":
    main()