-
Notifications
You must be signed in to change notification settings - Fork 470
/
Copy pathmain.go
131 lines (114 loc) · 4.32 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package main
import (
"github.com/PuerkitoBio/goquery"
"github.com/hu17889/go_spider/core/common/page"
"github.com/hu17889/go_spider/core/common/request"
"github.com/hu17889/go_spider/core/pipeline"
"github.com/hu17889/go_spider/core/spider"
"net/http"
"net/url"
"strings"
"fmt"
"errors"
)
// MyPageProcesser is the page processor handed to the spider in main. It
// carries state between the login page and the follow-up page.
type MyPageProcesser struct {
// cookies holds the cookies read from the page tagged "site_login" in
// Process; they are attached to the request for the index page.
cookies []*http.Cookie
}
// NewMyPageProcesser returns a pointer to a zero-valued MyPageProcesser; its
// cookie list starts out empty.
func NewMyPageProcesser() *MyPageProcesser {
	return new(MyPageProcesser)
}
// Process inspects a downloaded page and records the results we want on it.
// A page tagged "site_login" is handled as the login response; every other
// page is handled as the post-login index page.
//
// HTML is parsed with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	if p.GetUrlTag() == "site_login" {
		this.processLogin(p)
		return
	}
	this.processIndex(p)
}

// processLogin stores the cookies carried by the login page and, when there
// are any, queues a request for the index page that sends those cookies.
func (this *MyPageProcesser) processLogin(p *page.Page) {
	this.cookies = p.GetCookies()
	if len(this.cookies) == 0 {
		p.AddField("info", "get cookies failed")
		return
	}
	p.AddField("info", "get cookies success")

	// NewRequest arguments, in order:
	//  1. URL.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. URL tag: a name that lets PageProcesser and Pipeline tell URLs apart.
	//  4. Method: POST or GET.
	//  5. Post data: body string sent to the server.
	//  6. Header for the HTTP request.
	//  7. Cookies.
	//  8. HTTP redirect function.
	// A ninth argument is passed as nil here; the original comment did not
	// describe it (presumably request metadata; confirm against the library).
	req := request.NewRequest("http://backadmin.hucong.net/site/index", "html", "site_index", "GET", "", nil, this.cookies, nil, nil)
	p.AddTargetRequestWithParams(req)
}

// processIndex looks for the page title element and records whether the
// login worked: a non-empty title is reported as success.
func (*MyPageProcesser) processIndex(p *page.Page) {
	pageTitle := p.GetHtmlParser().Find(".page-content .page-title").Text()
	if len(pageTitle) == 0 {
		p.AddField("info", "login failed")
		return
	}
	p.AddField("page_title", pageTitle)
	p.AddField("info", "login success")
}

// processRepoList is the GitHub repository-list crawling logic that used to
// sit after an unconditional return inside Process, where it could never run.
// Nothing calls it. It is kept, unchanged, because it is the only code in this
// file that uses the goquery and strings imports; dropping it without also
// dropping those imports would stop the file from compiling.
func (*MyPageProcesser) processRepoList(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}

	query := p.GetHtmlParser()
	var urls []string
	query.Find("h3[class='repo-list-name'] a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		urls = append(urls, "http://github.com/"+href)
	})
	// These URLs will be saved and crawled by other coroutines.
	p.AddTargetRequests(urls, "html")

	name := query.Find(".entry-title .author").Text()
	name = strings.Trim(name, " \t\n")
	repository := query.Find(".entry-title .js-current-repository").Text()
	repository = strings.Trim(repository, " \t\n")
	if name == "" {
		p.SetSkip(true)
	}
	// The entity we want to save through the Pipeline.
	p.AddField("author", name)
	p.AddField("project", repository)
}
// Finish writes a placeholder message to standard output.
// NOTE(review): presumably invoked by the spider once crawling ends; confirm
// against the library.
func (this *MyPageProcesser) Finish() {
	fmt.Print("TODO:before end spider \r\n")
}
// myRedirect is a redirect function that refuses every redirect, so the
// cookies can be read from the login response itself.
//
// It always returns errors.New("normal"). Per the original note, when a
// CheckRedirect function returns an error with that message, the error
// handling that follows client.Do ignores it.
func myRedirect(req *http.Request, via []*http.Request) error {
	stop := errors.New("normal")
	return stop
}
// main builds the login POST request and runs the spider with it.
func main() {
	// Form fields sent as the POST body.
	form := url.Values{}
	form.Set("name", "admin")
	form.Set("pwd", "admin")

	// HTTP header for the login request.
	headers := http.Header{}
	headers.Set("Content-Type", "application/x-www-form-urlencoded")

	// NewRequest arguments, in order:
	//  1. URL.
	//  2. Response type: "html", "json", "jsonp" or "text".
	//  3. URL tag: a name that lets PageProcesser and Pipeline tell URLs apart.
	//  4. Method: POST or GET.
	//  5. Post data: body string sent to the server.
	//  6. Header for the HTTP request.
	//  7. Cookies.
	//  8. HTTP redirect function.
	// A ninth argument is passed as nil here; the original comment did not
	// describe it.
	loginReq := request.NewRequest("http://backadmin.hucong.net/main/user/login", "html", "site_login", "POST", form.Encode(), headers, nil, myRedirect, nil)

	// The spider takes the PageProcesser and a task name that the Pipeline
	// uses when recording results.
	crawler := spider.NewSpider(NewMyPageProcesser(), "TaskName")
	crawler.AddRequest(loginReq).
		AddPipeline(pipeline.NewPipelineConsole()). // Print result on screen
		SetThreadnum(3).                            // Crawl requests with three coroutines
		Run()
}