main.go
/*
Crawl a host and save all relevant pages to local storage
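
Example usage (illustrative; at least one of -timeout or -pages must be set):

go run main.go -host=https://www.example.com -pages=100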
*/
package main
import (
"bytes"
"crypto/rand"
"encoding/hex"
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"sync"
"time"

"golang.org/x/net/html"
)
// concurrentStorage acts as a set: a shared store for multiple goroutines and
// a validator, so that urls already handled by one goroutine are not processed again.
type concurrentStorage struct {
sync.Mutex
domain string
urls map[url.URL]bool
urlsSize int
}
func newConcurrentStorage(d string) *concurrentStorage {
return &concurrentStorage{
domain: d,
urls: map[url.URL]bool{},
}
}
// add records a URL iff it has not already been seen by any goroutine.
// It returns true if the URL was unseen and has now been saved, and false
// if it was already present, so the caller knows not to process it again.
func (c *concurrentStorage) add(u url.URL) bool {
c.Lock()
defer c.Unlock()
if _, ok := c.urls[u]; ok {
return false
}
c.urls[u] = true
c.urlsSize++
return true
}
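// size returns the number of unique URLs recorded so far.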
func (c *concurrentStorage) size() int {
c.Lock()
defer c.Unlock()
return c.urlsSize
}
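// Log levels, in increasing order of verbosity.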
const (
ERROR = iota // 0
WARNING // 1
INFO // 2
DEBUG // 3
VERBOSE // 4
)
var (
runtimeLog *os.File
logger *log.Logger
logLevel = ERROR
)
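// randomHex returns n random bytes encoded as a lowercase hex string.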
func randomHex(n int) (string, error) {
bytes := make([]byte, n)
if _, err := rand.Read(bytes); err != nil {
return "", err
}
return hex.EncodeToString(bytes), nil
}
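// initLogger sets the global log level and opens the log file; every line is
// prefixed with a random identifier so output from separate runs can be told apart.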
func initLogger(ll int) {
logLevel = ll
// Assign to the package-level runtimeLog rather than shadowing it with :=,
// so the file handle stays reachable after this function returns.
var err error
runtimeLog, err = os.OpenFile("/var/log/scrapefreeproxylist.log",
os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Printf("error opening file: %v\n", err)
os.Exit(1)
}
runID, _ := randomHex(10)
prefix := runID + "-applog:"
logger = log.New(runtimeLog, prefix, log.Lshortfile|log.LstdFlags)
logger.Println("---------- START " + prefix + "----------")
fmt.Println("---------- START " + prefix + "----------")
}
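// logError always logs; the helpers below only log when the configured level is high enough.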
func logError(msg string) {
cMsg := "ERROR: " + msg
logger.Println(cMsg)
fmt.Println(cMsg)
}
func logWarning(msg string) {
if logLevel >= WARNING {
cMsg := "WARNING: " + msg
logger.Println(cMsg)
fmt.Println(cMsg)
}
}
func logInfo(msg string) {
if logLevel >= INFO {
logger.Println("INFO: " + msg)
fmt.Println("INFO: " + msg)
}
}
func logDebug(msg string) {
if logLevel >= DEBUG {
logger.Println("DEBUG: " + msg)
fmt.Println("DEBUG: " + msg)
}
}
func logVerbose(msg string) {
if logLevel >= VERBOSE {
logger.Println("VERBOSE: " + msg)
fmt.Println("VERBOSE: " + msg)
}
}
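// Command-line flags and the count of pages saved so far.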
var (
domain string
timeout int
pageLimit int
pageCounter int
)
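// validateUrl rejects URLs that have no host, e.g. a bare path.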
func validateUrl(u url.URL) error {
if u.Host == "" {
return errors.New("no host found in " + u.String() +
". Try the format https://www.example.com")
}
return nil
}
// Get the contents of a web page
// Return error if the request fails
func getHttp(url url.URL) (io.ReadCloser, error) {
resp, err := http.Get(url.String())
if err != nil {
log.Printf("HTTP failed to GET url=%s. error=%s\n", url.String(), err)
return nil, err
}
return resp.Body, nil
}
// Extract the href attribute from a Token
func getHref(t html.Token) (ok bool, href string) {
for _, a := range t.Attr {
if a.Key == "href" {
href = a.Val
ok = true
}
}
return
}
// sanitizeUrl fills in missing pieces of a URL (host, scheme) and then validates it.
// If the URL is invalid or does not belong to the target domain, it returns false.
func sanitizeUrl(href string, domain string) (url.URL, bool) {
if strings.Trim(href, " ") == "" {
return url.URL{}, false
}
u, err := url.Parse(href)
if err != nil {
log.Println(err)
return url.URL{}, false
}
if u.Host == "" {
u.Host = domain
} else if u.Host != domain || u.Path == "/" || u.Path == "" {
return url.URL{}, false
}
if u.Scheme == "" {
u.Scheme = "https"
}
// Ignore foreign schemes (mailto, ftp, etc.)
if !strings.Contains(u.Scheme, "http") {
return url.URL{}, false
}
// TODO: Check URL is accessible
return *u, true
}
// Get only urls of the specified domain given the body of a web page
func getUrls(body []byte, domain string) ([]url.URL, error) {
// holds only valid urls
var urls []url.URL
reader := bytes.NewReader(body)
tokenizer := html.NewTokenizer(reader)
infinitefor:
for {
tokenType := tokenizer.Next()
switch {
case tokenType == html.ErrorToken:
// End of the document, we're done
break infinitefor
case tokenType == html.StartTagToken:
t := tokenizer.Token()
// Check if the token is an <a> tag
isAnchor := t.Data == "a"
if !isAnchor {
continue
}
// Extract the href value, if there is one
ok, href := getHref(t)
if !ok {
continue
}
if url, ok := sanitizeUrl(href, domain); ok {
urls = append(urls, url)
}
}
}
return urls, nil
}
// Save the page contents (converted to a byte array) to a file in local storage
// Returns whether the page was saved successfully
func savePage(url url.URL, body []byte) bool{
// TODO: Take save location as a CMD line flag
rootDir := "/tmp/scraper"
dirPath := rootDir + "/" + url.Host + url.Path
err := os.MkdirAll(dirPath, 0777)
if err != nil {
log.Printf("Cannot create directory %s. \nError: %s", dirPath, err)
return false
}
filePath := dirPath + "/index.html"
err = ioutil.WriteFile(filePath, body, 0777)
if err != nil {
log.Printf("Cannot write to file=%s. \nError: %s", filePath, err)
return false
}
return true
}
// scrape visits a page, saves it to local storage, and extracts all valid urls
// for the page's domain. It returns an error if the target URL is empty, the page
// cannot be fetched over HTTP, or the urls cannot be extracted.
func scrape(u url.URL) ([]url.URL, error) {
if strings.Trim(u.String(), " ") == "" {
return []url.URL{}, errors.New("empty url")
}
pageReadCloser, err := getHttp(u)
if err != nil {
log.Printf("failed to get page at url=%s. err=%s\n", u.String(), err)
return []url.URL{}, err
}
// Defer the Close only after the error check: on failure the body is nil and
// closing it would panic.
defer pageReadCloser.Close()
page, err := ioutil.ReadAll(pageReadCloser)
if err != nil {
log.Printf("Could not read page buffer for url=%s\n", u.String())
return []url.URL{}, err
}
if savePage(u, page) {
pageCounter++
}
if pageLimit != -1 && pageCounter >= pageLimit {
logInfo("Reached page download limit=" + strconv.Itoa(pageLimit))
os.Exit(0)
}
urls, err := getUrls(page, u.Host)
if err != nil {
log.Printf("failed to extract valid urls for pageReadCloser at u=%s. err=%s\n", u, err)
return []url.URL{}, err
}
return urls, nil
}
// crawl consumes urls from the channel; multiple instances can run in parallel to increase throughput.
func crawl(urlSet *concurrentStorage, ch chan url.URL){
for {
select {
case u := <- ch:
if ok := urlSet.add(u); ok {
log.Printf("Received url=%s", u.String())
urls, err := scrape(u)
if err != nil {
log.Printf("Could not scrape url=%s.\nError: %s", u.String(), err)
break
}
for _, url := range urls {
// Pass url as an argument so each goroutine sends its own value rather than
// all of them capturing the final loop variable.
go func(u url.URL) { ch <- u }(url)
}
}
}
}
}
// todo: unittest
func validateFlags(d string, t int, p int) error{
if d == "" {
eMsg := "-host needs to be set"
//logError(eMsg)
return errors.New(eMsg)
}
if t == -1 && p == -1 {
eMsg := "-timeout or -pages needs to be set"
//logError(eMsg)
return errors.New(eMsg)
}
// todo: validate flags
return nil
}
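// main parses and validates the flags, normalizes the target URL, starts two
// crawler goroutines, seeds the channel with the target, and then sleeps until
// the timeout elapses (the page limit is enforced inside scrape).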
func main() {
initLogger(VERBOSE)
pageCounter = 0
flag.StringVar(&domain, "host", "", "The url to scrape.")
flag.IntVar(&timeout, "timeout", -1, "Lifetime of this " +
"process (in seconds). If not set will run indefinitely until another " +
"constraint is met (page limit). At least one constraint needs to be " +
"set.")
flag.IntVar(&pageLimit, "pages", -1, "Limit of pages to" +
" visit. If not set will run until the timeout constraint is met. At " +
"least one constraint needs to be set.")
flag.Parse()
err := validateFlags(domain, timeout, pageLimit)
if err != nil {
logError("Invalid flags. Err: " + err.Error())
os.Exit(1)
}
targetURL, err := url.Parse(domain)
if err != nil {
logError("Could not parse target url: " + domain)
logError("Err: " + err.Error())
os.Exit(1)
}
err = validateUrl(*targetURL)
if err != nil {
logError("Invalid target url. Err: " + err.Error())
os.Exit(1)
}
// TODO: write function to find a valid schema by requesting with multiple versions of the url
if targetURL.Scheme == "" {
targetURL.Scheme = "https"
}
urlSet := newConcurrentStorage(targetURL.Host)
urlCh := make(chan url.URL, 2)
go crawl(urlSet, urlCh)
go crawl(urlSet, urlCh)
urlCh <- *targetURL
if timeout != -1 {
time.Sleep(time.Duration(timeout) * time.Second)
} else {
time.Sleep(time.Duration(1) * time.Hour) // Max time
}
}