-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiff_extend.py
85 lines (69 loc) · 1.67 KB
/
diff_extend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import click
from utils import load, dump, cloned
from github_fetch import fetch_data
import requests
import re
from tqdm import tqdm
def diff_url(user, repo, pnr):
    """Return the GitHub URL of the raw ``.diff`` for a pull request.

    Args:
        user: Owner part of the repository path.
        repo: Repository name.
        pnr: Pull request number.
    """
    return f'https://github.com/{user}/{repo}/pull/{pnr}.diff'
# Captures the extension (text after the last dot) of an "a/<path>" entry
# from a diff header. Written as a raw string so the backslash escapes reach
# the regex engine unchanged, and digits are accepted so that extensions
# such as php3/php4/php5 can be captured at all.
pattern = re.compile(r'a/.*\.([a-z0-9]*)$')
# File extensions (without the leading dot) whose diffs are kept,
# grouped by language / file family.
include = (
    'c', 'cpp', 'cc', 'h', 'o',  # C, C++
    'cs',  # C#
    'py',  # Python
    'java',  # Java
    'js', 'ts',  # JavaScript / TypeScript
    'phtml', 'php', 'php3', 'php4', 'php5', 'phps', 'phpt',  # PHP
    'rb',  # Ruby
    'go',  # Go
    'swift',  # Swift
    'html', 'htm', 'xml', 'css', 'scss', 'sass', 'xlf', 'less',  # Web
    'json', 'yaml', 'md', 'rst', 'txt', 'yml', 'sh', 'sql',  # General files
)
def get_diffs(rid, pr_num):
    """Download a pull request's diff and keep the per-file sections of interest.

    Args:
        rid: Repository identifier in ``user/repo`` form.
        pr_num: Pull request number.

    Returns:
        A list of strings, one per file whose extension is listed in
        ``include``. Each string is that file's section of the diff with the
        leading ``diff --git `` marker removed.

    Raises:
        ConnectionError: If the request fails or the response status is
            not 200.
        ValueError: If ``rid`` does not split into exactly two parts on '/'.
    """
    user, repo = rid.split('/')
    url = diff_url(user, repo, pr_num)
    try:
        # Without a timeout a stalled connection would block the run forever.
        res = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # requests' exceptions do not derive from the builtin ConnectionError,
        # so convert them to the one failure type this function reports.
        raise ConnectionError(f'Could not fetch {url}') from err
    if res.status_code != 200:
        raise ConnectionError('Diff does not exist')

    # Every file section begins with a "diff --git " header at the start of a
    # line. Anchoring the split there avoids cutting a section in two when
    # that text merely appears inside a changed line.
    sections = re.split(r'^diff --git ', res.text, flags=re.MULTILINE)[1:]

    diffs = []
    for entry in sections:
        # The header line looks like "a/<path> b/<path>"; take the "a/" side.
        header = entry.split('\n', 1)[0]
        path = header.split(' ')[0]
        # No match means the path has no recognizable extension.
        match = pattern.match(path)
        filetype = match.group(1) if match else ''
        if filetype in include:
            diffs.append(entry)
    return diffs
@click.command()
@click.option('--file', '-f', help='Dataset file')
def main(file):
    """Extend every pull request in a dataset with its filtered diffs.

    Loads the dataset from FILE, calls get_diffs for each
    (repository id, pull request number) key and passes the extended
    mapping to dump with the name 'out.pickle'. Entries whose diff cannot
    be fetched are skipped.
    """
    data = load(file)
    extended = {}
    for (rid, num), value in tqdm(data.items()):
        try:
            diffs = get_diffs(rid, num)
        except (ConnectionError, requests.RequestException):
            # Network-level failures from requests are not builtin
            # ConnectionErrors, so both are caught to honor the "skip" intent.
            print('Diff fetch failed, skipping')
            continue
        # Key by the repository id taken from the dataset (not the builtin
        # `id`), so the output keeps the same keys as the input.
        extended[(rid, num)] = {**value, 'diffs': diffs}
    dump(extended, 'out.pickle')


if __name__ == '__main__':
    main()