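"""Generate README.md for the awesome-colab-notebooks collection.

Reads the project lists in data/*.json and renders the trending, course,
research, tutorial and "best of" sections of the README.
"""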
from collections import Counter, defaultdict
from datetime import datetime
from json import load
from os.path import join
from pathlib import Path
from urllib.request import urlopen
from cv2 import IMREAD_GRAYSCALE, THRESH_BINARY, imdecode, threshold
from google.cloud import bigquery
from matplotlib import font_manager
from numpy import any as np_any, asarray, mean, median
from pypistats import overall, recent
from tqdm import tqdm
from wordcloud import WordCloud
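
# Link names with a matching svg badge in images/ are rendered as <img>
# badges (see parse_link below).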
BADGES = set(image.stem for image in Path('images').glob('*.svg'))
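# Size of the "best of" lists; get_top_authors may widen it to keep ties.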
TOP_K = 20


def colab_url(url: str) -> str:
    # The badge image URLs in this file were stripped during extraction;
    # the official Colab badge is assumed here.
    return f'[![](https://colab.research.google.com/assets/colab-badge.svg)]({url})'


def doi_url(url: str) -> str:
    # Citation-count badge for the DOI (assumed badge endpoint).
    doi = url.split('org/')[1]
    return f'[![](https://api.juleskreuer.eu/citation-badge.php?doi={doi})]({url})'


def git_url(url: str) -> str:
    # GitHub star-count badge for the owner/repo slug (assumed shields.io badge).
    repo = '/'.join(url.split('com/')[1].split('/')[:2])
    return f'[![](https://img.shields.io/github/stars/{repo}?style=social)]({url})'


def pypi_url(package: str, period='dm') -> str:
    # PyPI downloads badge; period is 'dm' (per month) or 'dw' (per week)
    # (assumed shields.io badge).
    return f'[![](https://img.shields.io/pypi/{period}/{package})](https://pypi.org/{package}/)'


def read_json(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        return load(f)


def load_projects():
    # 'cources' [sic] matches the on-disk filename data/cources.json.
    for entity in ['cources', 'research', 'tutorials']:
        yield from read_json(join('data', f'{entity}.json'))
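
# Each project entry in those JSON files provides at least: name, description,
# author (a list of [name, url] pairs), links (a list of [type, url, ...]
# entries, where the third element is stars for 'git' and citations for 'doi'),
# colab, and update (a unix timestamp).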


def get_git_name_stars(git_link: tuple[str, str, int]) -> tuple[str, int]:
    _, url, stars = git_link
    # Truncate to https://github.com/<owner>/<repo>
    # (19 == len('https://github.com/')).
    idx = url.index('/', 19) + 1
    idx = url.find('/', idx)
    name = url[:idx] if idx != -1 else url
    return (name, stars)


def parse_link(link_tuple: tuple[str, str], height=20) -> str:
    name, url = link_tuple
    if name in BADGES:
        return f'[<img src="images/{name}.svg" alt="{name}" height={height}/>]({url})'
    return f'[{name}]({url})'


def parse_authors(authors: list[tuple[str, str]], num_of_visible: int) -> str:
    if len(authors) == 1:
        return '[{}]({})'.format(*authors[0])
    if len(authors) <= num_of_visible + 1:
        return '<ul>' + ' '.join(f'<li>[{author}]({link})</li>' for author, link in authors[:num_of_visible + 1]) + '</ul>'
    # Show the first num_of_visible authors, collapse the rest into <details>.
    visible = ' '.join(f'<li>[{author}]({link})</li>' for author, link in authors[:num_of_visible])
    hidden = ' '.join(f'<li>[{author}]({link})</li>' for author, link in authors[num_of_visible:])
    return '<ul>' + visible + '<details><summary>others</summary>' + hidden + '</details></ul>'


def parse_links(list_of_links: list[tuple[str, str]]) -> str:
    if len(list_of_links) == 0:
        return ''
    dct = defaultdict(list)
    for name_url in list_of_links:
        name, url = name_url[0], name_url[1]
        dct[name].append(url)
    line = ''
    # The first doi and git links are promoted to badges in front of the list.
    if 'doi' in dct:
        line += doi_url(dct['doi'][0]) + ' '
        dct.pop('doi')
    if 'git' in dct:
        line += git_url(dct['git'][0]) + ' '
        if len(dct['git']) == 1:
            dct.pop('git')
        else:
            dct['git'].pop(0)
    if len(dct) == 0:
        return line
    return line + '<ul>' + ''.join('<li>' + ', '.join(parse_link((name, url)) for url in dct[name]) + '</li>' for name in dct) + '</ul>'


def get_top_authors(topK) -> tuple[str, int]:
    global TOP_K
    authors, num_of_authors = [], []
    for project in load_projects():
        authors.extend([tuple(author) for author in project['author']])
        num_of_authors.append(len(project['author']))
    cnt = Counter(authors)
    most_common = cnt.most_common()
    # Extend the cut-off past topK while authors are tied with the topK-th
    # one, and store the widened value back into the global TOP_K.
    contributions = most_common[topK][1]
    idx = topK
    while idx < len(most_common) and most_common[idx][1] == contributions:
        idx += 1
    num_of_visible = int(min(mean(num_of_authors), median(num_of_authors)))
    TOP_K = idx
    return '<ul>' + ' '.join(f'<li>[{author}]({link})</li>' for (author, link), _ in most_common[:idx]) + '</ul>', num_of_visible


def get_top_repos(topK) -> str:
    repos = {}
    for project in load_projects():
        for link in project['links']:
            if link[0] == 'git':
                # Only the first git link of each project is counted.
                name, stars = get_git_name_stars(link)
                repos[name] = stars
                break
    repos = sorted(repos.items(), key=lambda f: f[1], reverse=True)[:topK]
    return '<ul>' + ' '.join(f"<li>{url.split('com/')[1].split('/')[1]}\t{git_url(url)}</li>" for url, _ in repos) + '</ul>'


def generate_cloud():
    """Render the word cloud of project names/descriptions to images/cloud.svg."""
    def get_font_path(font_name='Comic Sans MS'):
        for path in font_manager.findSystemFonts(fontext='ttf'):
            font = font_manager.FontProperties(fname=path)
            if font_name in font.get_name():
                return path

    # Use the Colab logo as a stencil: binarize it, then crop away empty
    # rows and columns.
    with urlopen('https://img.icons8.com/color/480/google-colab.png') as resp:
        image = asarray(bytearray(resp.read()), dtype="uint8")
    _, bw_img = threshold(imdecode(image, IMREAD_GRAYSCALE), 127, 255, THRESH_BINARY)
    mask = bw_img[np_any(bw_img, axis=1)]
    mask = mask[:, np_any(mask, axis=0)]
    text = ' '.join(' '.join([p['name'], p['description']]) for p in load_projects())
    wc = WordCloud(mask=~mask, collocation_threshold=10, colormap='plasma', font_path=get_font_path())
    wc.generate(text)
    with open(join('images', 'cloud.svg'), 'w') as f:
        f.write(wc.to_svg())


def get_top_papers(topK) -> str:
    papers = {}
    for project in load_projects():
        for link in project['links']:
            if link[0] == 'doi':
                # Keep the highest citation count seen for each DOI.
                if link[1] not in papers or link[2] > papers[link[1]][1]:
                    papers[link[1]] = (project['name'], link[2])
                break
    top = sorted([(name, url, citations) for url, (name, citations) in papers.items()], key=lambda f: f[2], reverse=True)[:topK]
    return '<ul>' + ' '.join(f"<li>{name}\t{doi_url(url)}</li>" for name, url, _ in top) + '</ul>'


def get_best_of_the_best(authors: str, packages, topK: int) -> str:
    packages_str = '<ul>' + ' '.join(f'<li>{package}\t{pypi_url(package)}</li>' for package, _, _ in sorted(packages, key=lambda p: p[2], reverse=True)[:topK]) + '</ul>'
    table = f'''| authors | repositories | papers | packages |
|---|---|---|---|
| {authors} | {get_top_repos(topK)} | {get_top_papers(topK)} | {packages_str} |'''
    return table


def generate_table(fn: str, num_visible_authors: int):
    data = read_json(fn)
    colabs = sorted(data, key=lambda kv: kv['update'], reverse=True)
    to_write = [
        '| name | description | authors | links | colaboratory | update |',
        '|------|-------------|:--------|:------|:------------:|:------:|',
    ]
    for line in colabs:
        line['author'] = parse_authors(line['author'], num_visible_authors)
        line['links'] = parse_links(sorted(line['links'], key=lambda x: x[0]))
        line['url'] = colab_url(line['colab'])
        line['update'] = datetime.fromtimestamp(line['update']).strftime('%d.%m.%Y')
        to_write.append('| {name} | {description} | {author} | {links} | {url} | {update} |'.format(**line))
    return to_write


def get_pypi_downloads(engine: str = 'pypistats'):
    packages = set()
    for project in load_projects():
        for url in project['links']:
            if url[0] == 'pypi':
                packages.add(url[1].rstrip('/').split('/')[-1])
    if engine == 'bigquery':
        # Downloads via the public PyPI dataset on BigQuery (needs credentials).
        def get_query(date_filtering: str) -> str:
            return f"""
                SELECT
                    file.project,
                    COUNT(*) AS num_downloads
                FROM
                    `bigquery-public-data.pypi.file_downloads`
                WHERE
                    file.project IN ('{"', '".join(packages)}')
                    AND DATE(timestamp) BETWEEN {date_filtering}
                GROUP BY
                    file.project
            """
        client = bigquery.Client()
        last_month = 'DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH) AND DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 DAY)'
        total = 'DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()'
        query_last_month = client.query(get_query(last_month))
        query_total = client.query(get_query(total))
        # Alternative: a single query computing both counters at once.
        # query_job = client.query(f"""
        #     SELECT
        #         file.project,
        #         COUNTIF(DATE(timestamp) BETWEEN DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)
        #             AND DATE_SUB(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 DAY)) AS num_downloads_last_month,
        #         COUNT(*) AS total_num_downloads
        #     FROM
        #         `bigquery-public-data.pypi.file_downloads`
        #     WHERE
        #         file.project IN ('{"', '".join(packages)}')
        #     GROUP BY
        #         file.project
        # """)
        res_last_month = {row.project: row.num_downloads for row in query_last_month.result()}
        res_total = {row.project: row.num_downloads for row in query_total.result()}
        return [(package, res_last_month[package], res_total[package]) for package in packages]
    # Default: per-package calls to the pypistats API.
    return [(package,
             int(recent(package, format='pandas').last_month),
             int(overall(package, format='pandas').query('category == "Total"').downloads))
            for package in tqdm(packages)]


def get_trending(packages, topK: int):
    old_stars = read_json('data/stars.json')
    old_citations = read_json('data/citations.json')
    new_stars, new_citations = {}, {}
    for project in load_projects():
        used = set()
        for link in project['links']:
            if link[0] == 'git' and 'git' not in used:
                name, stars = get_git_name_stars(link)
                new_stars[name] = stars
                used.add('git')
            elif link[0] == 'doi' and 'doi' not in used:
                _, url, citations = link
                new_citations[project['name']] = (url, citations)
                used.add('doi')
    # Rank by growth relative to the previous snapshot; items without an old
    # value get an infinite denominator and therefore sort last.
    trending_repos = sorted(new_stars, key=lambda url: new_stars[url] / old_stars.get(url, float('inf')), reverse=True)[:topK]
    trending_papers = sorted(new_citations, key=lambda name: new_citations[name][1] / max(old_citations.get(name, ['', float('inf')])[1], 1), reverse=True)[:topK]
    trending_packages = sorted(packages, key=lambda p: p[1] / (p[2] - p[1]), reverse=True)[:topK]
    repos_str = '<ul>' + ' '.join(f"<li>{url.split('com/')[1].split('/')[1]}\t{git_url(url)}</li>" for url in trending_repos) + '</ul>'
    papers_str = '<ul>' + ' '.join(f"<li>{name}\t{doi_url(new_citations[name][0])}</li>" for name in trending_papers) + '</ul>'
    packages_str = '<ul>' + ' '.join(f'<li>{package}\t{pypi_url(package, period="dw")}</li>' for package, _, _ in trending_packages) + '</ul>'
    return f'''| repositories | papers | packages |
|---|---|---|
| {repos_str} | {papers_str} | {packages_str} |'''


def generate_markdown():
    top_authors, num_visible_authors = get_top_authors(TOP_K)
    packages = get_pypi_downloads()
    to_write = [
        # The badge/image markdown in this list was stripped to bare links
        # during extraction; the standard hits-counter embed is assumed here.
        '[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Famrzv%2Fawesome-colab-notebooks)](https://hits.seeyoufarm.com)',
        '\n',
        '![](images/cloud.svg)',
        '\nThe page might not be rendered properly. Please open the [README.md](https://github.com/amrzv/awesome-colab-notebooks/blob/main/README.md) file directly',
        '# Awesome colab notebooks collection for ML experiments',
        '## Trending',
        get_trending(packages, TOP_K),
        '## Courses',
        '<details>\n<summary>COURSES</summary>\n',
        *generate_table(join('data', 'cources.json'), num_visible_authors),
        '\n</details>\n',
        '## Research',
        '<details>\n<summary>RESEARCH</summary>\n',
        *generate_table(join('data', 'research.json'), num_visible_authors),
        '\n</details>\n',
        '## Tutorials',
        '<details>\n<summary>TUTORIALS</summary>\n',
        *generate_table(join('data', 'tutorials.json'), num_visible_authors),
        '\n</details>\n',
        '# Best of the best',
        get_best_of_the_best(top_authors, packages, TOP_K),
        '\n[![Stargazers over time](https://starchart.cc/amrzv/awesome-colab-notebooks.svg)](https://starchart.cc/amrzv/awesome-colab-notebooks)',
        '\n(generated by [generate_markdown.py](generate_markdown.py) based on [research.json](data/research.json), [tutorials.json](data/tutorials.json), [cources.json](data/cources.json))'
    ]
    with open('README.md', 'w', encoding='utf-8') as f:
        f.write('\n'.join(to_write))


def main():
    generate_markdown()


if __name__ == '__main__':
    main()
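
# Assumed usage: run from the repository root so the relative data/, images/
# and README.md paths resolve:
#   python generate_markdown.py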