Skip to content

Commit

Permalink
fix(route): nytimes (DIYgod#7449)
Browse files Browse the repository at this point in the history
Use Puppeteer to load nytimes's images, fix the morning post URL, and use the official RSS feed
Co-authored-by: Reki Dunois <[email protected]>
  • Loading branch information
RekiDunois authored May 25, 2021
1 parent 37ab8af commit 472b293
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 34 deletions.
9 changes: 4 additions & 5 deletions lib/routes/nytimes/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ module.exports = async (ctx) => {
lang = 'dual';
}

const browser = await require('@/utils/puppeteer')();
const feed = await parser.parseURL(rssUrl);
const items = await Promise.all(
feed.items.splice(0, 10).map(async (item) => {
Expand Down Expand Up @@ -64,11 +65,7 @@ module.exports = async (ctx) => {
hasEnVersion = true;
link = $('.dual-btn a').last().attr().href;

response = await ctx.cache.tryGet(`nyt: ${link}`, async () => {
const response = await got.get(link);

return response.data;
});
response = await utils.PuppeterGetter(ctx, browser, link);
}
}
}
Expand Down Expand Up @@ -103,6 +100,8 @@ module.exports = async (ctx) => {
})
);

browser.close();

ctx.state.data = {
title,
link: 'https://cn.nytimes.com',
Expand Down
52 changes: 25 additions & 27 deletions lib/routes/nytimes/morning_post.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,46 @@ const cheerio = require('cheerio');
const utils = require('./utils');

module.exports = async (ctx) => {
const url = 'https://m.cn.nytimes.com/morning-brief/';
const url = 'https://www.nytimes.com/svc/collections/v1/publish/https://www.nytimes.com/zh-hans/series/daily-briefing-chinese/rss.xml';
const response = await got({
method: 'get',
url,
});
const data = response.data;
const $ = cheerio.load(data);
const post = $('.article-list .regular-item')
.map((index, elem) => {
const $item = $(elem);
const $link = $item.find('a');

return {
title: $link.attr('title'),
link: $link.attr('href'),
};
})
.get();
const post = $('item').map((index, elem) => {
const title = $(elem).find('title').text();
const link = $(elem).find('link').next().text();
return {
link: link,
title: title
};
}).get();

const items = await Promise.all(
post.map(async (item) => {
const link = item.link;
const result = await ctx.cache.tryGet(`nyt: ${link}`, async () => {
const response = await got.get(link);
const browser = await require('@/utils/puppeteer')();

return utils.ProcessFeed(response.data);
});
const items = await Promise.all(
post.map(
async (item) => {
// use Puppeteer because all the images are lazy-loaded
const result = utils.ProcessFeed(await utils.PuppeterGetter(ctx, browser, item.link), true);

item.pubDate = result.pubDate;
item.pubDate = result.pubDate;

// Match 感谢|謝.*[email protected]
const ending = /&#x611F;(&#x8C22|&#x8B1D);.*?cn\.letters@nytimes\.com&#x3002;/g;
// Match 感谢|謝.*[email protected]
const ending = /&#x611F;(&#x8C22|&#x8B1D);.*?cn\.letters@nytimes\.com&#x3002;/g;

const matching = '<div class="article-paragraph">';
const formatted = '<br>' + matching;
const matching = '<div class="article-paragraph">';
const formatted = '<br>' + matching;

item.description = result.description.replace(ending, '').split(matching).join(formatted);
item.description = result.description.replace(ending, '').split(matching).join(formatted);

return Promise.resolve(item);
})
return Promise.resolve(item);
})
);

browser.close();

ctx.state.data = {
title: '纽约时报中文网|每日简报',
link: url,
Expand Down
26 changes: 24 additions & 2 deletions lib/routes/nytimes/utils.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
const cheerio = require('cheerio');

const ProcessImage = ($, e) => {
const photo = $(e).find('figure');
const photo = $(e).find('figure').find('picture').find('img');

let cover = `<figure><img src='${photo[0].attribs.itemid}'><br><figcaption>`;
let cover = `<figure><img src='${photo.attr('src')}'><br><figcaption>`;

const caption = $(e).find('figcaption');

Expand All @@ -12,6 +12,19 @@ const ProcessImage = ($, e) => {
return cover;
};

/**
 * Load `link` in a new page of the given Puppeteer browser and return the
 * rendered `<body>` inner HTML, going through the route cache.
 *
 * The lookup is keyed as `nyt: ${link}`; the page is only opened when
 * `ctx.cache.tryGet` invokes the supplied getter (presumably on a cache
 * miss — confirm against the cache implementation).
 *
 * @param {object} ctx - Request context exposing `ctx.cache.tryGet(key, getter)`.
 * @param {object} browser - Puppeteer browser instance; the caller owns it and closes it.
 * @param {string} link - URL of the article to render.
 * @returns {Promise<string>} Inner HTML of the rendered document body.
 */
const PuppeterGetter = async (ctx, browser, link) => {
    const result = await ctx.cache.tryGet(`nyt: ${link}`, async () => {
        const page = await browser.newPage();
        try {
            await page.goto(link);
            // Serialize the DOM after the page's scripts ran, so lazy-loaded content is included.
            return await page.evaluate(() => document.querySelector('body').innerHTML);
        } finally {
            // Release the tab even when navigation or evaluation fails; otherwise
            // every fetched article leaves an open page behind until the browser is closed.
            await page.close();
        }
    });
    return result;
};

const ProcessFeed = (data, hasEnVersion = false) => {
const $ = cheerio.load(data);

Expand Down Expand Up @@ -40,6 +53,14 @@ const ProcessFeed = (data, hasEnVersion = false) => {
$(e).remove();
});

// remove ad
content.find('#CNB').each((i, e) => {
$(e).next().remove();
});

content.find('div[id]').each((i, e) => {
$(e).remove();
});
// remove useless DOMs
content.find('aside').each((i, e) => {
$(e).remove();
Expand Down Expand Up @@ -79,4 +100,5 @@ const ProcessFeed = (data, hasEnVersion = false) => {

// Helpers exported for the nytimes route handlers, which call them as
// `utils.ProcessFeed(...)` and `utils.PuppeterGetter(...)`.
module.exports = {
ProcessFeed,
PuppeterGetter,
};

0 comments on commit 472b293

Please sign in to comment.