GitHubConfig.py
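"""Crawler4py configuration for mining Python source files and READMEs from GitHub repositories.

Seeds are built from the repository metadata in ../bucket2.json, the GitHub contents API is
walked via ExtractNextLinks, and blob contents are written under repoData/ by HandleData.
"""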
try:
    from urlparse import urlparse, parse_qs
except ImportError:
    from urllib.parse import urlparse, parse_qs
import json
import os
import re
import shelve

from Crawler4py.Config import Config


class GitHubConfig(Config):
    def __init__(self):
        Config.__init__(self)
        self.UserAgentString = "lordnahor-libseek-MSR-app"
        self.MaxWorkerThreads = 8
        self.DepthFirstTraversal = True
        self.FrontierTimeOut = 100
        self.WorkerTimeOut = 100
        self.OutBufferTimeOut = 100
        self.PolitenessDelay = 1000
        self.MaxPageSize = 1048576 * 5
        self.IgnoreRobotRule = True
        # Persistent url -> filename map, filled in ExtractNextLinks and read back in HandleData.
        self.urlToNameMap = shelve.open("urlDataPersist.shelve")

    def GetSeeds(self):
        '''Returns the first set of urls to start crawling from.'''
        seeds = []
        with open("../bucket2.json", "r") as bucketFile:
            repoMetaData = json.load(bucketFile)
        try:
            os.mkdir("repoData")
        except OSError:
            pass
        for repo in repoMetaData:
            name = repo["full_name"].replace("/", "-")
            try:
                os.mkdir("repoData/" + name)
            except OSError:
                pass
            with open("repoData/" + name + "/metadata.json", "w") as metaFile:
                json.dump(repo, metaFile, sort_keys=True, indent=4, separators=(',', ': '))
            # Strip the trailing "{+path}" template from the contents_url.
            seeds.append(repo["contents_url"][:-7])
        return seeds
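
    # Data flow: GetSeeds queues each repo's contents listing, ExtractNextLinks follows
    # directories and git trees while recording blob url -> filename in urlToNameMap, and
    # HandleData appends each blob's "content" payload to that repo's output file.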
    def HandleData(self, parsedData):
        '''Function to handle url data. Guaranteed to be thread safe.
        parsedData = {"url" : "url", "text" : "text data from html", "html" : "raw html data"}
        Advisable to keep this function light; data can be massaged later. Storing the data is what matters most.'''
        downloadData = json.loads(parsedData["html"])
        if type(downloadData) is type({}) and "content" in downloadData:
            # Blob responses carry the file contents; listings and trees are handled in ExtractNextLinks.
            foldername = "-".join(re.match(".*repos/(.*)/(.*)/git.*", parsedData["url"]).groups())
            if parsedData["url"].encode("utf-8") not in self.urlToNameMap:
                filename = parsedData["url"].encode("utf-8")
            else:
                filename = self.urlToNameMap[parsedData["url"].encode("utf-8")]
            if "readme" in filename.lower():
                outFile = open("repoData/" + foldername + "/description.txt", "a")
            else:
                outFile = open("repoData/" + foldername + "/allPythonContent.py", "a")
            try:
                outFile.write("__FILENAME__ = " + filename + "\n" + downloadData["content"] + "\n########NEW FILE########\n")
            except UnicodeError:
                print("Error at ", filename, parsedData["url"])
            outFile.close()
            print("Wrote data to " + foldername + " File: " + filename)
            if parsedData["url"].encode("utf-8") in self.urlToNameMap:
                del self.urlToNameMap[parsedData["url"].encode("utf-8")]
                self.urlToNameMap.sync()

    def AllowedSchemes(self, scheme):
        '''Function that allows the schemes/protocols in the set.'''
        return scheme.lower() in set(["http", "https", "ftp"])

    def ValidUrl(self, url):
        '''Function to determine if the url is a valid url that should be fetched or not.'''
        return True

    def GetTextData(self, htmlData):
        '''Function to clean up html raw data and get the text from it. Keep it small.
        Not thread safe; returns an object that will go into the parsedData["text"] field for the HandleData function above.'''
        return ""

    def ExtractNextLinks(self, url, rawData, outputLinks):
        '''Function to extract the next links to iterate over. No need to validate the links; they get validated in the ValidUrl function when added to the frontier.
        Add the output links to the outputLinks parameter (has to be a list). Return a bool signifying success of extracting the links.
        rawData for url will not be stored if this function returns False. If there are no links but the rawData is still valid and has to be saved, return True.
        Keep the default implementation if you need all the html links from rawData.'''
        downloadData = json.loads(rawData)
        if type(downloadData) is list:
            # Top-level contents listing: queue interesting files and recurse into directories.
            for entry in downloadData:
                if entry["type"] == "file":
                    if entry["name"].endswith(".py") or "readme" in entry["name"].lower():
                        if "git_url" in entry:
                            outputLinks.append(entry["git_url"])
                            self.urlToNameMap[entry["git_url"].encode("utf-8")] = entry["name"]
                            self.urlToNameMap.sync()
                elif entry["type"] == "dir":
                    outputLinks.append(entry["git_url"])
        else:
            # One of the recursive download responses.
            if "tree" in downloadData:
                # It is a folder; walk it recursively.
                for entry in downloadData["tree"]:
                    if entry["type"] != "blob" or entry["path"].endswith(".py") or "readme" in entry["path"].lower():
                        if "url" in entry:
                            outputLinks.append(entry["url"])
                            if entry["type"] == "blob":
                                self.urlToNameMap[entry["url"].encode("utf-8")] = entry["path"]
                                self.urlToNameMap.sync()
            # If it is a blob, there is nothing to extract here; HandleData stores its content.
        return True

    def GetAuthenticationData(self):
        '''Function that returns dict(top_level_url : tuple(username, password)) for basic authentication purposes.'''
        with open("../github-creds.json") as credsFile:
            creds = json.load(credsFile)
        username = creds["username"]
        password = creds["password"]
        return {"api.github.com" : (username, password)}