diff --git a/lib/routes/nytimes/index.js b/lib/routes/nytimes/index.js index c490c262f6a824..5af8329a362b78 100644 --- a/lib/routes/nytimes/index.js +++ b/lib/routes/nytimes/index.js @@ -24,6 +24,7 @@ module.exports = async (ctx) => { lang = 'dual'; } + const browser = await require('@/utils/puppeteer')(); const feed = await parser.parseURL(rssUrl); const items = await Promise.all( feed.items.splice(0, 10).map(async (item) => { @@ -64,11 +65,7 @@ module.exports = async (ctx) => { hasEnVersion = true; link = $('.dual-btn a').last().attr().href; - response = await ctx.cache.tryGet(`nyt: ${link}`, async () => { - const response = await got.get(link); - - return response.data; - }); + response = await utils.PuppeterGetter(ctx, browser, link); } } } @@ -103,6 +100,8 @@ module.exports = async (ctx) => { }) ); + browser.close(); + ctx.state.data = { title, link: 'https://cn.nytimes.com', diff --git a/lib/routes/nytimes/morning_post.js b/lib/routes/nytimes/morning_post.js index ea785b07132d1a..a9f477c7a82c88 100644 --- a/lib/routes/nytimes/morning_post.js +++ b/lib/routes/nytimes/morning_post.js @@ -3,48 +3,46 @@ const cheerio = require('cheerio'); const utils = require('./utils'); module.exports = async (ctx) => { - const url = 'https://m.cn.nytimes.com/morning-brief/'; + const url = 'https://www.nytimes.com/svc/collections/v1/publish/https://www.nytimes.com/zh-hans/series/daily-briefing-chinese/rss.xml'; const response = await got({ method: 'get', url, }); const data = response.data; const $ = cheerio.load(data); - const post = $('.article-list .regular-item') - .map((index, elem) => { - const $item = $(elem); - const $link = $item.find('a'); - - return { - title: $link.attr('title'), - link: $link.attr('href'), - }; - }) - .get(); + const post = $('item').map((index, elem) => { + const title = $(elem).find('title').text(); + const link = $(elem).find('link').next().text(); + return { + link: link, + title: title + }; + }).get(); - const items = await Promise.all( - post.map(async (item) => { - const link = item.link; - const result = await ctx.cache.tryGet(`nyt: ${link}`, async () => { - const response = await got.get(link); + const browser = await require('@/utils/puppeteer')(); - return utils.ProcessFeed(response.data); - }); + const items = await Promise.all( + post.map( + async (item) => { + // use puppeter cause all the image is lazy-load + const result = utils.ProcessFeed(await utils.PuppeterGetter(ctx, browser, item.link), true); - item.pubDate = result.pubDate; + item.pubDate = result.pubDate; - // Match 感谢|謝.*?cn.letters@nytimes.com。 - const ending = /感(谢|謝);.*?cn\.letters@nytimes\.com。/g; + // Match 感谢|謝.*?cn.letters@nytimes.com。 + const ending = /感(谢|謝);.*?cn\.letters@nytimes\.com。/g; - const matching = '
'; - const formatted = '
' + matching; + const matching = '
'; + const formatted = '
' + matching; - item.description = result.description.replace(ending, '').split(matching).join(formatted); + item.description = result.description.replace(ending, '').split(matching).join(formatted); - return Promise.resolve(item); - }) + return Promise.resolve(item); + }) ); + browser.close(); + ctx.state.data = { title: '纽约时报中文网|每日简报', link: url, diff --git a/lib/routes/nytimes/utils.js b/lib/routes/nytimes/utils.js index 2ec7f6806a99ff..584f3b19f73a61 100644 --- a/lib/routes/nytimes/utils.js +++ b/lib/routes/nytimes/utils.js @@ -1,9 +1,9 @@ const cheerio = require('cheerio'); const ProcessImage = ($, e) => { - const photo = $(e).find('figure'); + const photo = $(e).find('figure').find('picture').find('img'); - let cover = `

`; + let cover = `

`; const caption = $(e).find('figcaption'); @@ -12,6 +12,19 @@ const ProcessImage = ($, e) => { return cover; }; +const PuppeterGetter = async (ctx, browser, link) => { + const result = await ctx.cache.tryGet(`nyt: ${link}`, async () => { + const page = await browser.newPage(); + await page.goto(link); + const response = await page.evaluate( + () => + document.querySelector('body').innerHTML + ); + return response; + }); + return result; +}; + const ProcessFeed = (data, hasEnVersion = false) => { const $ = cheerio.load(data); @@ -40,6 +53,14 @@ const ProcessFeed = (data, hasEnVersion = false) => { $(e).remove(); }); + // remove ad + content.find('#CNB').each((i, e) => { + $(e).next().remove(); + }); + + content.find('div[id]').each((i, e) => { + $(e).remove(); + }); // remove useless DOMs content.find('aside').each((i, e) => { $(e).remove(); @@ -79,4 +100,5 @@ const ProcessFeed = (data, hasEnVersion = false) => { module.exports = { ProcessFeed, + PuppeterGetter, };