diff --git a/lib/crawler.js b/lib/crawler.js index 2eeb681..a154eff 100644 --- a/lib/crawler.js +++ b/lib/crawler.js @@ -35,6 +35,13 @@ exports.fetch = (host, route, regexp, codeLen) => const bufferHelper = new BufferHelper() const statusCode = res.statusCode + // 302 Move Temporarily + // 这种情况一般重试就可以了,所以视为超时统一重试处理 + if (statusCode === 302) { + res.resume() + return reject(new Error('timeout')) + } + if (statusCode !== 200) { res.resume() return reject(new Error('Request Failed. Status Code: ' + statusCode)) @@ -49,7 +56,11 @@ exports.fetch = (host, route, regexp, codeLen) => let current while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim() if (Object.keys(result).length === 0) { - return reject(new Error('Request Failed. rawData: '), rawData) + const raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8') + if (raw.includes('请开启JavaScript并刷新该页')) { + console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n') + process.exit(0) + } } return resolve(result) diff --git a/lib/worker.js b/lib/worker.js index 7cc4568..39880d4 100644 --- a/lib/worker.js +++ b/lib/worker.js @@ -11,6 +11,11 @@ const limit = 100 * @datetime 2018-01-31 22:11 */ exports.fetchProvinces = async () => { + const count = await Province.count() + if (count !== 0) { + return + } + console.log('[1/1]正在抓取省级数据...') const o = await crawler.fetchProvinces() const rows = [] @@ -29,12 +34,19 @@ exports.fetchProvinces = async () => { exports.fetchCities = async () => { await exports.fetchProvinces() - const count = await Province.count() + const fetchedProvinceCode = await City.aggregate('provinceCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT) + const where = { code: { [Sequelize.Op.notIn]: fetchedProvinceCode } } + const count = await Province.count({ where }) + + if (count === 0) { + return + } + let index = 0 let hasNext = true let after while (hasNext) { - const r = await Province.paginate({ limit, after }) + const r = await Province.paginate({ where, limit, after }) const rows = [] for (let i = 0; i < r.results.length; i++) { const { dataValues: {