-
Notifications
You must be signed in to change notification settings - Fork 63
/
Copy pathsurfer.go
108 lines (102 loc) · 3.07 KB
/
surfer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Package surfer is a high level concurrency http client.
//
// It has `surf` and `phantom` download engines, highly simulated browser behavior, and support for simulated login, among other features.
//
// Features:
// - Both surf and phantomjs engines are supported
// - Support random User-Agent
// - Support cache cookie
// - Support http/https
//
// Usage:
// package main
//
// import (
// "github.com/andeya/surfer"
// "io/ioutil"
// "log"
// )
//
// func main() {
// // Use surf engine
// resp, err := surfer.Download(&surfer.Request{
// Url: "http://github.com/andeya/surfer",
// })
// if err != nil {
// log.Fatal(err)
// }
// b, err := ioutil.ReadAll(resp.Body)
// log.Println(string(b), err)
//
// // Use phantomjs engine
// resp, err = surfer.Download(&surfer.Request{
// Url: "http://github.com/andeya",
// DownloaderID: 1,
// })
// if err != nil {
// log.Fatal(err)
// }
// b, err = ioutil.ReadAll(resp.Body)
// log.Println(string(b), err)
// resp.Body.Close()
// surfer.DestroyJsFiles()
// }
package surfer
import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"sync"
)
var (
	// surf is the surf-engine downloader, lazily created on first use by Download.
	surf Surfer
	// phantom is the phantomjs-engine downloader, lazily created on first use by Download.
	phantom Surfer
	// once_surf / once_phantom guard the one-time initialization of the two engines.
	once_surf    sync.Once
	once_phantom sync.Once
	// tempJsDir is where the phantomjs engine writes its temporary js scripts.
	tempJsDir = "./tmp"
	// phantomjsFile is the path of the phantomjs executable; override via SetPhantomJsFilePath.
	phantomjsFile = "./phantomjs"
	// cookieJar is shared by both engines so cookies persist across requests.
	// NOTE(review): the error from cookiejar.New is deliberately discarded —
	// with a nil Options it never fails in the current stdlib implementation.
	cookieJar, _ = cookiejar.New(nil)
)
// Download implements the surfer downloader interface: it dispatches the
// request to the engine selected by req.DownloaderID (SurfID or PhomtomJsID),
// lazily initializing that engine exactly once.
//
// It returns the engine's *http.Response, or an error if the engine failed
// or the DownloaderID is unknown.
func Download(req *Request) (resp *http.Response, err error) {
	switch req.DownloaderID {
	case SurfID:
		once_surf.Do(func() { surf = New(cookieJar) })
		resp, err = surf.Download(req)
	case PhomtomJsID:
		once_phantom.Do(func() { phantom = NewPhantom(phantomjsFile, tempJsDir, cookieJar) })
		resp, err = phantom.Download(req)
	default:
		// Previously an unknown ID silently returned (nil, nil), which invites
		// a nil-pointer dereference on resp.Body at the caller; fail loudly.
		err = fmt.Errorf("surfer: unknown DownloaderID %v", req.DownloaderID)
	}
	return
}
// SetPhantomJsFilePath sets the location of the phantomjs executable used by
// the phantom download engine. Call it before the first phantom download,
// since the engine captures the path when it is first initialized.
func SetPhantomJsFilePath(path string) {
	phantomjsFile = path
}
// DestroyJsFiles removes the temporary js files created by the phantomjs
// engine. It is a no-op when the phantom engine was never initialized.
func DestroyJsFiles() {
	pt, ok := phantom.(*Phantom)
	if !ok {
		return
	}
	pt.DestroyJsFiles()
}
// Surfer represents an core of HTTP web browser for crawler.
//
// The single Download method handles every HTTP verb; the Request argument
// selects the method and carries the per-verb parameters listed below:
// GET @param url string, header http.Header, cookies []*http.Cookie
// HEAD @param url string, header http.Header, cookies []*http.Cookie
// POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
// POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
type Surfer interface {
	// Download performs the request and returns the raw *http.Response;
	// the caller is responsible for closing resp.Body.
	Download(*Request) (resp *http.Response, err error)
}