generated from amosproj/amos202Xss0Y-projname
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlandscape_explorer.py
187 lines (141 loc) · 6.67 KB
/
landscape_explorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
This script retrieves files with specified extensions from GitHub repositories,
augments a YAML file with download URLs, and caches requests for efficient retrieval.
It handles rate limits and logs errors that occur during API requests.
Dependencies:
- requests
- os
- yaml
- tqdm
- requests_cache
- logging
- collections
Environment Variables:
- GITHUB_TOKEN: GitHub token for authentication (optional)
Usage:
Ensure correct configuration of 'BASE_REPO_YAML' and 'OUTPUT_PATH' for input and output file paths respectively. Adjust 'EXTENSIONS' for desired file types to retrieve.
Note:
Ensure 'repo_url' attributes in the YAML file correspond to valid GitHub repository URLs. Cached requests expire after 7 days ('landscape_cache').
"""
#!/usr/bin/python3
from yaml.representer import Representer
from collections import defaultdict
import requests
import os
import yaml
from tqdm import tqdm
import requests_cache
import logging
import time
import collections
# Placeholder kept as the default value of TOKEN so the name stays defined when
# GITHUB_TOKEN is not set; it is never sent to GitHub (see HEADERS below).
_TOKEN_PLACEHOLDER = "Replace your token"
TOKEN = os.getenv('GITHUB_TOKEN', _TOKEN_PLACEHOLDER)

# Headers sent with every request issued through make_request().
HEADERS = {
    'Accept': 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28',
}
# The token is optional: only authenticate when a real one is configured.
# Sending the placeholder as a bearer token would present invalid credentials
# instead of falling back to anonymous (lower rate limit) access.
if TOKEN and TOKEN != _TOKEN_PLACEHOLDER:
    HEADERS['Authorization'] = f'Bearer {TOKEN}'

BASE_API_URL = 'https://api.github.com'
# Source landscape definition that gets augmented with download URLs.
BASE_REPO_YAML = 'https://raw.githubusercontent.com/cncf/landscape/master/landscape.yml'
# File extensions (without the leading dot) whose URLs are collected.
EXTENSIONS = ["yml", "yaml", "pdf", "md"]
OUTPUT_PATH = '../../sources/landscape_augmented_repos.yml'

# Let PyYAML serialise defaultdict instances as ordinary mappings.
yaml.add_representer(collections.defaultdict, Representer.represent_dict)

# Cache requests for 7 days (value is in seconds)
requests_cache.install_cache('landscape_cache', expire_after=7 * 24 * 60 * 60)
def _get_json(url: str) -> dict:
    """Fetch *url* via make_request and return its JSON object, or {} on any failure."""
    response = make_request(url)
    if response is None:
        return {}
    try:
        payload = response.json()
    except ValueError:
        logging.error(f'Response from {url} is not valid JSON')
        return {}
    return payload if isinstance(payload, dict) else {}


def get_urls(repo_url: str, default_branch: str = "", tree_sha: str = "", file_path: str = "", res: defaultdict = None) -> defaultdict:
    """
    Retrieves the URLs of files with specific extensions from a GitHub repository.

    The repository tree is requested recursively first. If GitHub marks the
    answer as truncated, the tree is requested again non-recursively and every
    sub-directory is walked with its own call to this function.

    Args:
        repo_url (str): The URL of the GitHub repository.
        default_branch (str, optional): The default branch of the repository.
            Looked up via get_default_branch when empty. Defaults to "".
        tree_sha (str, optional): The SHA of the tree object. Falls back to
            the default branch when empty. Defaults to "".
        file_path (str, optional): The path of the directory that 'tree_sha'
            refers to, used as prefix for the collected paths. Defaults to "".
        res (defaultdict, optional): A defaultdict to store the URLs of files
            with specific extensions. Defaults to None (a new one is created).
    Returns:
        defaultdict: Maps each extension in EXTENSIONS that was found to the
        list of raw download URLs. Returned unchanged when the URL is not a
        GitHub repository URL or the repository data cannot be retrieved.
    """
    if res is None:
        res = defaultdict(list)

    # "<owner>/<repo>" part of the URL. A trailing slash is dropped because it
    # would otherwise produce a double slash in the API path.
    _, found, repo_slug = repo_url.partition("https://github.com/")
    repo_slug = repo_slug.strip("/")
    if not found or not repo_slug:
        logging.warning(f'Skipping {repo_url}: not a GitHub repository URL')
        return res

    if not default_branch:
        default_branch = get_default_branch(repo_url)
    if not default_branch:
        # Without a branch neither the tree nor the download URLs can be built.
        logging.warning(f'Skipping {repo_url}: default branch could not be determined')
        return res
    if not tree_sha:
        tree_sha = default_branch

    tree_url = f'{BASE_API_URL}/repos/{repo_slug}/git/trees/{tree_sha}'
    response = _get_json(f'{tree_url}?recursive=1')
    truncated = bool(response.get('truncated'))
    if truncated:
        logging_path = file_path if file_path else "root"
        logging.info(
            f'request for files in path {logging_path} in repository: {repo_url} got truncated because of file number limit')
        # Fall back to a single level and descend into sub-trees below.
        response = _get_json(tree_url)

    tree = response.get('tree')
    if not tree:
        return res

    base_download_url = f'https://raw.githubusercontent.com/{repo_slug}/{default_branch}/'
    for entry in tree:
        path = entry.get('path')
        if not path:
            continue
        entry_type = entry.get('type')
        new_file_path = f"{file_path}/{path}" if file_path else path
        if entry_type == 'blob':
            # rpartition leaves 'dot' empty for names without any '.', so a
            # file literally called "md" is not mistaken for a markdown file.
            _, dot, ext = path.rpartition('.')
            if dot and ext in EXTENSIONS:
                res[ext].append(base_download_url + new_file_path)
        elif truncated and entry_type == 'tree':
            logging.debug(f'Recursively fetching URLs for path {new_file_path}')
            get_urls(repo_url, default_branch, entry.get('sha'), new_file_path, res)
    return res
def get_default_branch(repo_url: str) -> str:
    """
    Retrieves the default branch of a GitHub repository.

    Args:
        repo_url (str): The URL of the GitHub repository.
    Returns:
        str: The name of the default branch, or an empty string when the URL
        is not a GitHub repository URL, the request failed, or the response
        does not contain a 'default_branch' entry.
    """
    # "<owner>/<repo>" part of the URL; a trailing slash is dropped because it
    # would otherwise end up inside the API path.
    _, found, repo_slug = repo_url.partition("https://github.com/")
    repo_slug = repo_slug.strip("/")
    if not found or not repo_slug:
        logging.warning(f'Cannot determine default branch of {repo_url}: not a GitHub repository URL')
        return ""

    url = f'{BASE_API_URL}/repos/{repo_slug}'
    response = make_request(url)
    if response is None:
        # make_request already logged the underlying request error.
        return ""
    try:
        payload = response.json()
    except ValueError:
        logging.error(f'Response from {url} is not valid JSON')
        return ""
    if not isinstance(payload, dict):
        return ""
    return payload.get('default_branch') or ""
def generate_augmented_yml_with_urls() -> None:
    """
    Retrieves the YAML content from BASE_REPO_YAML, augments it with download URLs,
    and saves the augmented content to OUTPUT_PATH.

    Every item that has a non-empty 'repo_url' receives a 'repo' mapping; when
    matching files were found it contains 'download_urls', keyed by file
    extension. Nothing is written when the landscape file cannot be downloaded
    or is not a YAML mapping.

    Returns:
        None
    """
    response = make_request(BASE_REPO_YAML)
    if response is None or not response.ok:
        logging.error(f'Could not download {BASE_REPO_YAML}; no output written')
        return
    content = yaml.safe_load(response.content.decode('utf-8'))
    if not isinstance(content, dict):
        logging.error(f'Unexpected content in {BASE_REPO_YAML}; no output written')
        return

    # Create the directory the output file actually lives in, so changing
    # OUTPUT_PATH does not require touching this function.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # 'or []' tolerates categories/subcategories that lack the nested list.
    for category in tqdm(content.get('landscape') or [], desc="categories"):
        for subcategory in tqdm(category.get('subcategories') or [], desc="subcategories"):
            for item in tqdm(subcategory.get('items') or [], desc="sources"):
                repo_url = item.get('repo_url')
                if not repo_url:
                    continue
                urls = get_urls(repo_url)
                # Plain dicts serialise the same way as the defaultdicts used
                # before; 'repo' stays an empty mapping when nothing was found.
                repo_info = {}
                if urls:
                    repo_info['download_urls'] = dict(urls)
                item['repo'] = repo_info

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, sort_keys=False)
def _rate_limit_wait_seconds(headers):
    """Return the pause requested by the rate-limit headers in seconds, or None if there is none."""
    try:
        if 'retry-after' in headers:
            # 'retry-after' is a duration in seconds.
            return max(0, int(headers['retry-after']))
        if 'x-ratelimit-remaining' in headers and int(headers['x-ratelimit-remaining']) == 0:
            # 'x-ratelimit-reset' is a point in time (UTC epoch seconds), not
            # a duration; one extra second absorbs small clock differences.
            return max(0, int(headers['x-ratelimit-reset']) - time.time() + 1)
    except (KeyError, ValueError):
        logging.warning('Could not interpret rate limit headers; continuing without waiting')
    return None


def make_request(url):
    """
    Makes an HTTP GET request to the provided URL with error handling and rate limit handling.

    When the response carries a 'retry-after' header, or reports that no
    requests remain in the current rate-limit window, the function waits for
    the indicated time. If the response itself was rejected (HTTP 403 or 429)
    the request is then repeated, up to three attempts in total. Responses
    served from the local request cache are returned immediately because their
    rate-limit headers describe a past request.

    Args:
        url (str): The URL to make the request to.
    Returns:
        requests.Response or None: The last response received, or None if the
        request raised a requests exception (the error is logged).
    """
    print("making request to url: ", url)
    max_attempts = 3
    response = None
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
        except requests.exceptions.RequestException as e:
            logging.error(f'Error making request to {url}: {e}')
            return None

        # requests_cache marks replayed responses with 'from_cache'; their
        # headers are stale, so they must not trigger a wait.
        if getattr(response, 'from_cache', False):
            return response

        wait_seconds = _rate_limit_wait_seconds(response.headers)
        if wait_seconds is None:
            return response

        logging.warning(
            f'Rate limit reached. Waiting {wait_seconds:.0f} seconds before continuing')
        time.sleep(wait_seconds)
        if response.status_code not in (403, 429):
            # The request itself succeeded and only used up the quota;
            # after waiting there is nothing to repeat.
            return response
    return response
# Script entry point: build the augmented landscape file only when this module
# is executed directly, not when it is imported.
if __name__ == "__main__":
    generate_augmented_yml_with_urls()