Skip to content
This repository has been archived by the owner on Mar 12, 2022. It is now read-only.

钢材价格爬虫 #48

Closed
klren0312 opened this issue Nov 11, 2019 · 2 comments
Closed

钢材价格爬虫 #48

klren0312 opened this issue Nov 11, 2019 · 2 comments

Comments

@klren0312
Copy link
Owner

后期整合进定时服务

image

const cheerio = require('cheerio')
const tableParser = require('cheerio-tableparser')
const rp = require('request-promise')
const url = 'http://hq.zgw.com/hefei/jiancai/2343364.html'

// Download the price page at `url` and print one { name, size, price } record
// per table row.
rp({
  uri: url
}).then(body => {
  const $ = cheerio.load(body)
  // Column 4 is fetched with the flag set to false and each entry is re-parsed
  // with $(v) so the price can be read from the element's `type` data key.
  const priceCol = spiderTable($, false)[4]
  // Drop the first entry of every column (presumably the table's header row).
  priceCol.shift()
  const pricesFormat = priceCol.map(v => $(v).data().type)
  const otherData = spiderTable($, true)
  const nameCol = otherData[0]
  const sizeCol = otherData[1]
  nameCol.shift()
  sizeCol.shift()
  const ironArr = []
  for (let i = 0, len = nameCol.length; i < len; i++) {
    ironArr.push({
      name: nameCol[i],
      size: sizeCol[i],
      price: pricesFormat[i]
    })
  }
  console.log(ironArr)
}).catch(err => {
  // Without a rejection handler, a failed request or a table that no longer
  // matches the selector would only show up as an unhandled promise rejection.
  console.error(err)
})

/**
 * Run the table parser on the price table of the loaded page.
 * @param {function} $ cheerio instance for the page
 * @param {boolean} hasHtml forwarded unchanged as the third argument of parsetable
 *   (NOTE(review): presumably the plugin's text-mode switch, confirm against cheerio-tableparser)
 * @returns {*} whatever parsetable returns for the matched table
 */
function spiderTable ($, hasHtml) {
  const priceTableSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div > div.lm_m > div.lm_mt > div.article > table'
  // Register the parsetable plugin on this cheerio instance before querying.
  tableParser($)
  const table = $(priceTableSelector)
  return table.parsetable(false, false, hasHtml)
}
@klren0312 klren0312 self-assigned this Nov 11, 2019
@klren0312
Copy link
Owner Author

klren0312 commented Nov 12, 2019

爬取今日的价格

先从列表页判断是否有当天的数据, 若有则进入价格页抓取, 只保留马钢的报价
image

const cheerio = require('cheerio')
const tableParser = require('cheerio-tableparser')
const rp = require('request-promise')

// Date keys (month-day, day zero-padded) whose prices were already fetched by this process.
const complete = new Set()

/**
 * Fetch and print today's prices once.
 * Logs a notice instead when today was already handled or when checkNew
 * reports no entry for today.
 */
const spider = async () => {
  const now = new Date()
  const today = `${now.getMonth() + 1}-${zeroPadding(now.getDate())}`
  if (complete.has(today)) {
    // The original interpolated an undefined `date` here, which threw a ReferenceError.
    console.log(`today(${today}) is complete`)
    return
  }
  const link = await checkNew()
  if (!link) {
    console.log('今日无数据')
    return
  }
  const res = await getPrice(link)
  // Mark the day as done only after the prices were fetched successfully.
  complete.add(today)
  console.log(res)
}

// Report failures instead of leaving the returned promise unhandled.
spider().catch(err => {
  console.error(err)
})

/**
 * Check whether the first entry of the list page is dated today.
 * @returns {Promise<string|false>} absolute URL of today's price page, or false when
 *   the first entry carries no date, carries another date, or has no link
 */
async function checkNew () {
  const listUrl = 'http://hq.zgw.com/hefei/jiancai.html'
  const listSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div.lm_list > ul:nth-child(2)'
  const body = await rp({ uri: listUrl })
  const $ = cheerio.load(body)
  const firstItemText = $(listSelector).children('li:first-child').text().trim()
  // match() returns null when the text has no "digits-digits" run (e.g. the page
  // layout changed); indexing that result directly would throw a TypeError.
  const dateMatch = firstItemText.match(/(\d+-\d+)/)
  const now = new Date()
  const today = `${now.getMonth() + 1}-${zeroPadding(now.getDate())}`
  if (dateMatch === null || dateMatch[0] !== today) {
    return false
  }
  const link = $(`${listSelector} li:first-child a`).attr('href')
  // A missing href would otherwise produce the bogus URL "http://hq.zgw.comundefined".
  if (!link) {
    return false
  }
  return `http://hq.zgw.com${link}`
}

/**
 * Scrape a price page and return the rows of interest.
 * Columns 0-3 supply name, size, model and type; column 4 supplies the price,
 * read from each cell's `type` data key. The first entry of every column is
 * dropped (presumably the header row). Only rows whose type contains '马钢',
 * whose model is not 'HRB400E' and whose name does not contain '高线' are kept.
 * @param {string} url link of the price page
 * @returns {Promise<Array<{name: *, size: *, model: *, type: *, price: *}>>} filtered rows
 */
async function getPrice (url) {
  const tableSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div > div.lm_m > div.lm_mt > div.article > table'
  const html = await rp({ uri: url })
  const $ = cheerio.load(html)

  // Price column first: fetched with the flag off, then each entry is re-parsed.
  const prices = spiderTable(tableSelector, $, false)[4]
    .slice(1)
    .map(cell => $(cell).data().type)

  const textCols = spiderTable(tableSelector, $, true)
  const names = textCols[0].slice(1)
  const sizes = textCols[1].slice(1)
  const models = textCols[2].slice(1)
  const types = textCols[3].slice(1)

  const rows = names.map((name, idx) => ({
    name,
    size: sizes[idx],
    model: models[idx],
    type: types[idx],
    price: prices[idx]
  }))

  return rows
    .filter(row => row.type.includes('马钢'))
    .filter(row => row.model !== 'HRB400E')
    .filter(row => !row.name.includes('高线'))
}

/**
 * Left-pad a single-character number with one zero.
 * @param {number} num the number to format
 * @returns {string} the number as text, prefixed with '0' when it is one character long
 */
function zeroPadding (num) {
  const text = num.toString()
  return text.length === 1 ? `0${text}` : text
}

/**
 * Parse the table matched by a selector.
 * @param {string} tableSelector selector of the table
 * @param {function} $ cheerio instance for the page
 * @param {boolean} noHtml whether html tags should be stripped; forwarded as the
 *   third argument of parsetable
 * @returns {*} whatever parsetable returns for the matched table
 */
function spiderTable (tableSelector, $, noHtml) {
  // Register the parsetable plugin on this cheerio instance before querying.
  tableParser($)
  const table = $(tableSelector)
  return table.parsetable(false, false, noHtml)
}

@klren0312
Copy link
Owner Author

klren0312 commented Nov 12, 2019

添加定时任务

const cheerio = require('cheerio')
const tableParser = require('cheerio-tableparser')
const rp = require('request-promise')
const schedule = require('node-schedule')

let scheduleWork = null
// Date keys (month-day, day zero-padded) whose prices were already fetched by this process.
const complete = new Set()
// Job registered with the rule { hour: 12, minute: 30 }: fetch and print today's
// prices unless that day is already recorded in `complete`.
scheduleWork = schedule.scheduleJob({ hour: 12, minute: 30 }, async () => {
  const now = new Date()
  const today = `${now.getMonth() + 1}-${zeroPadding(now.getDate())}`
  console.log(complete, complete.has(today))
  if (complete.has(today)) {
    // The original interpolated an undefined `date` here, which threw a ReferenceError.
    console.log(`today(${today}) is complete`)
    return
  }
  try {
    const link = await checkNew()
    if (link) {
      const res = await getPrice(link)
      // Mark the day as done only after the prices were fetched successfully.
      complete.add(today)
      console.log(res)
    }
  } catch (err) {
    // A failed request would otherwise reject the promise returned by this callback.
    // NOTE(review): confirm how node-schedule treats rejected job promises.
    console.error(err)
  }
})

/**
 * Check whether the first entry of the list page is dated today.
 * @returns {Promise<string|false>} absolute URL of today's price page, or false when
 *   the first entry carries no date, carries another date, or has no link
 */
async function checkNew () {
  const listUrl = 'http://hq.zgw.com/hefei/jiancai.html'
  const listSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div.lm_list > ul:nth-child(2)'
  const body = await rp({ uri: listUrl })
  const $ = cheerio.load(body)
  const firstItemText = $(listSelector).children('li:first-child').text().trim()
  const now = new Date()
  const today = `${now.getMonth() + 1}-${zeroPadding(now.getDate())}`
  // match() yields null when the text has no "digits-digits" run; reading [0]
  // from that result directly would throw a TypeError.
  const dateMatch = firstItemText.match(/(\d+-\d+)/)
  const isToday = dateMatch !== null && dateMatch[0] === today
  if (!isToday) {
    return false
  }
  const link = $(`${listSelector} li:first-child a`).attr('href')
  // Without an href the template below would yield "http://hq.zgw.comundefined".
  return link ? `http://hq.zgw.com${link}` : false
}

/**
 * Scrape a price page into { name, size, price } records.
 * Columns 0 and 1 supply name and size; column 4 supplies the price, read from
 * each cell's `type` data key. The first entry of every column is dropped
 * (presumably the header row).
 * @param {string} url link of the price page
 * @returns {Promise<Array<{name: *, size: *, price: *}>>} one record per remaining row
 */
async function getPrice (url) {
  const tableSelector = 'body > div.wrap > div.cslm_tit > div.hq_con > div.fl.lm_left > div > div.lm_m > div.lm_mt > div.article > table'
  const html = await rp({ uri: url })
  const $ = cheerio.load(html)

  // Price column first: fetched with the flag off, then each entry is re-parsed.
  const rawPrices = spiderTable(tableSelector, $, false)[4].slice(1)
  const prices = rawPrices.map(cell => $(cell).data().type)

  const textCols = spiderTable(tableSelector, $, true)
  const names = textCols[0].slice(1)
  const sizes = textCols[1].slice(1)

  const records = []
  for (const [idx, name] of names.entries()) {
    records.push({ name, size: sizes[idx], price: prices[idx] })
  }
  return records
}

/**
 * Format a number so that single-character values get a leading zero.
 * @param {number} num the number to format
 * @returns {string} the textual form of num, with '0' prepended when it is one character long
 */
function zeroPadding (num) {
  const digits = num.toString()
  if (digits.length !== 1) {
    return digits
  }
  return `0${digits}`
}

/**
 * Extract table data for the given selector.
 * @param {string} tableSelector selector of the table
 * @param {function} $ cheerio instance for the page
 * @param {boolean} noHtml whether html tags should be stripped; passed through as
 *   the third argument of parsetable
 * @returns {*} whatever parsetable returns for the matched table
 */
function spiderTable (tableSelector, $, noHtml) {
  // The plugin is attached to this cheerio instance before the table is queried.
  tableParser($)
  const matchedTable = $(tableSelector)
  return matchedTable.parsetable(false, false, noHtml)
}

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

No branches or pull requests

1 participant