-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.js
125 lines (108 loc) · 3.28 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
'use strict';
var co = require('co');
var thunkify = require('thunkify');
var EventEmitter = require('pattern-emitter');
var request = require('request');
var _ = require('lodash');
var async = require('async');
var zlib = require('zlib');
var compose = require('composition');
var util = require('util');
// Wrap the callback-style `request` and `zlib.gunzip` functions with thunkify
// so that generator code in this file can `yield` their results.
var requestGet = thunkify(request);
var gunzip = thunkify(zlib.gunzip);
/**
 * State object for a single queued crawl task.
 *
 * Keeps a reference to the options it was built from and copies the
 * commonly used fields (`url`, `headers`, `callback`) onto itself.
 *
 * @param {Object} options - Task options; must be an object.
 * @param {string} [options.url] - Address to fetch.
 * @param {Object} [options.headers] - Headers for the task; an empty object
 *   is used when this is missing or falsy.
 * @param {Function} [options.callback] - Per-task completion handler.
 */
var CrawlerContext = function (options) {
  this.options = options;
  this.url = options.url;

  var suppliedHeaders = options.headers;
  this.headers = suppliedHeaders ? suppliedHeaders : {};

  this.callback = options.callback;
};
/**
 * Crawler: an event emitter that owns a task queue and a middleware list.
 *
 * The supplied options are merged over the defaults
 * `{ concurrency: 3, forceUTF8: true }`. Tasks pushed onto `taskQueue` are
 * handled by `process`, bound to this instance, with `options.concurrency`
 * passed to `async.queue`. The queue's `empty` and `drain` hooks re-emit as
 * `'empty'` and `'drain'` events on the crawler.
 *
 * @param {Object} [options] - Overrides for the default settings.
 */
var Crawler = function (options) {
  EventEmitter.call(this);

  var crawler = this;

  crawler.options = _.extend({ concurrency: 3, forceUTF8: true }, options);

  var worker = crawler.process.bind(crawler);
  crawler.taskQueue = async.queue(worker, crawler.options.concurrency);
  crawler.taskQueue.empty = function () {
    return crawler.emit('empty');
  };
  crawler.taskQueue.drain = function () {
    return crawler.emit('drain');
  };

  crawler.midwares = [];
};
util.inherits(Crawler, EventEmitter);
/**
 * Schedule a crawl task.
 *
 * Supported call shapes:
 *   queue(url)
 *   queue(options)
 *   queue(url, callback)
 *   queue(url, options)
 *   queue(options, callback)
 *   queue(url, options, callback)
 *
 * Each task gets its own options object: crawler-wide options first, then
 * the per-call options, then `url` / `callback`. Neither the crawler's
 * `options` nor the caller's options object is modified, so tasks queued
 * back-to-back cannot overwrite each other's `url` or inherit each other's
 * `callback`.
 *
 * @param {string|Object} url - URL to fetch, or an options object with `url`.
 * @param {Object|Function} [options] - Per-task options, or the callback.
 * @param {Function} [callback] - Stored on the task as `callback`.
 */
Crawler.prototype.queue = function (url, options, callback) {
  var taskCallback = callback;
  var overrides;

  if (typeof url === 'string') {
    if (typeof options === 'function') {
      // queue(url, callback)
      taskCallback = options;
      overrides = { url: url };
    } else {
      // queue(url) / queue(url, options) / queue(url, options, callback);
      // a missing options argument is treated as "no overrides".
      overrides = _.extend({}, options, { url: url });
    }
  } else {
    // queue(options) / queue(options, callback)
    if (typeof options === 'function') {
      taskCallback = options;
    }
    overrides = _.extend({}, url);
  }

  if (typeof taskCallback === 'function') {
    overrides.callback = taskCallback;
  }

  // Merge into a fresh object: extending `this.options` directly would make
  // every queued task share (and keep rewriting) one options object.
  var taskOptions = _.extend({}, this.options, overrides);
  this.taskQueue.push(new CrawlerContext(taskOptions));
};
/**
 * Queue worker: runs one task through the middleware chain.
 *
 * The registered middlewares followed by `send` are composed and invoked
 * with the task as `this`. When the chain resolves and the task carries a
 * `response`, an event named after the task's URL is emitted with the task,
 * and the task's own `callback` (if any) is invoked. `callback` is then
 * called with no arguments; if the chain rejects it is called with the error.
 *
 * @param {Object} task - Task context; used as `this` for the middlewares.
 * @param {Function} callback - Completion callback supplied by the queue.
 */
Crawler.prototype.process = function (task, callback) {
  var crawler = this;

  // Build a new array so appending `send` leaves `midwares` untouched.
  var pipeline = crawler.midwares.concat([crawler.send]);
  var run = compose(pipeline);

  function onCompleted() {
    if (task.response) {
      crawler.emit(task.url, task);
    }
    if (task.response && task.callback) {
      task.callback();
    }
    callback();
  }

  function onFailed(err) {
    callback(err);
  }

  run.call(task).then(onCompleted).catch(onFailed);
};
/**
 * Register a middleware by appending it to the end of `midwares`.
 *
 * @param {Function} callback - Middleware to add.
 */
Crawler.prototype.use = function (callback) {
  var stack = this.midwares;
  stack[stack.length] = callback;
};
/**
 * Final middleware: performs the HTTP request for the current task.
 *
 * Runs with the task context as `this`. Only `url` from the task options is
 * forwarded to `request`. On success the response is stored on the context
 * as `response`, its body as `body`, and the final request URL is copied to
 * `response.url`.
 *
 * When the response declares a gzip `content-encoding`, the body is
 * gunzipped. If the task's `forceUTF8` option is truthy the resulting Buffer
 * is stored as-is; otherwise it is decoded to a string using the task's
 * `encoding` option (Buffer's default encoding when unset).
 *
 * Errors are logged and swallowed on purpose (best effort): the generator
 * then returns `undefined` and leaves `this.response` unset.
 *
 * @param {*} next - Downstream middleware handle; unused because this is the
 *   last step of the chain.
 * @returns {Object|undefined} The response object, or `undefined` on failure.
 */
Crawler.prototype.send = function* (next) {
  const taskOptions = this.options;
  const requestOptions = _.pick(taskOptions, ['url']);
  try {
    const responses = yield requestGet(requestOptions);
    const response = responses[0];
    response.url = response.request.href;

    const contentEncoding = response.headers['content-encoding'];
    if (contentEncoding && contentEncoding.toLowerCase().indexOf('gzip') >= 0) {
      // NOTE(review): gunzip is fed whatever `request` produced as the body;
      // presumably that must be a Buffer for this to work - confirm.
      const inflated = yield gunzip(response.body);
      // Read the flags from the task options: `requestOptions` only carries
      // `url`, so it can never hold `forceUTF8` or `encoding`.
      response.body = taskOptions.forceUTF8
        ? inflated
        : inflated.toString(taskOptions.encoding);
    }

    this.response = response;
    this.body = response.body;
    return response;
  } catch (e) {
    console.log(e);
  }
};
// The Crawler constructor is the module's sole export.
module.exports = Crawler;