diff --git a/lib/config.js b/lib/config.js index 0c743aa4ff2573..aa213744bce43b 100644 --- a/lib/config.js +++ b/lib/config.js @@ -28,7 +28,9 @@ const value = { }, longWarningThreshold: parseFloat(envs.LONG_WARNING_THRESHOLD, 10) || 5, // in seconds prefetch: Boolean(envs.PRE_FETCH), - ua: envs.UA || "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", + ua: + envs.UA || + "Mozilla/5.0 (Linux; Android 10; Redmi Note 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36", listenInaddrAny: envs.LISTEN_INADDR_ANY || 1, // 是否允许公网连接,取值 0 1 requestRetry: parseInt(envs.REQUEST_RETRY) || 2, // 请求失败重试次数 // 是否显示 Debug 信息,取值 boolean 'false' 'key' ,取值为 'false' false 时永远不显示,取值为 'key' 时带上 ?debug=key 显示 diff --git a/lib/customize/bbc/index.js b/lib/customize/bbc/index.js new file mode 100644 index 00000000000000..16df35bd33f805 --- /dev/null +++ b/lib/customize/bbc/index.js @@ -0,0 +1,15 @@ +const { createGenericEndpoint, linkExtractors, contentExtractors } = require("@/utils/common-utils"); +const { fetchTextWithCrossWallProxy } = require("@/utils/http"); + +const endpoint = createGenericEndpoint({ + feedTitle: "BBC News", + endpointPath: "/bbc-news", + translateTitle: true, + language: "en", + entryUrl: "https://feeds.bbci.co.uk/news/rss.xml", + fetchText: fetchTextWithCrossWallProxy, + linkExtractor: linkExtractors.feedXmlLinkExtractor(), + contentExtractor: contentExtractors.js, +}); + +module.exports = endpoint; diff --git a/lib/utils/common-utils.js b/lib/utils/common-utils.js index f24d3886a22080..3e302b7742ede1 100644 --- a/lib/utils/common-utils.js +++ b/lib/utils/common-utils.js @@ -10,13 +10,37 @@ const toTitleCase = (str) => .map((word) => word.replace(word[0], word[0].toUpperCase())) .join(" "); -function domAElementLinkExtractor(selector, prefix = "") { +function domAElementLinkExtractor(selector = "a", prefix = "") { return ($) => $(selector) .get() .map((listItem) => 
prefix + $(listItem).attr("href")); } +/** Returns a link extractor that collects the href of every <a> matching matchReg and prepends prefix; anchors without an href are skipped. */ +function fuzzyALinkSelector(matchReg, prefix = "") { + return ($) => + $("a") + .get() + .map((listItem) => $(listItem).attr("href")) + .filter((href) => { + const match = href?.match(matchReg); + logger.debug(`fuzzyALinkSelector: ${href} ${match}`); + return match; + }) + .map((href) => prefix + href); +} + +/** Returns a link extractor for RSS content: takes the trimmed data of the node that follows each "item link" element and drops empty results. */ +function feedXmlLinkExtractor() { + // $ is a valid rss feed content + return ($) => + $("item link") + .get() + .map((link) => link?.next?.data?.trim()) + .filter(Boolean); +} + /** * * @param {cheerio.Root} page @@ -99,7 +123,7 @@ function createGenericEndpoint(options) { ctx.state.skip_pure = options.skipPure; if (options.linkExtractor && (options.contentExtractor || options.jsonExtractor)) { - const links = uniq(options.linkExtractor($).slice(0, options.maxItemsInList)); + const links = uniq(options.linkExtractor($)).slice(0, options.maxItemsInList); if (links.length === 0) { logger.warn("no links found", { baseUrl: entryUrlValue }); @@ -120,7 +144,8 @@ function createGenericEndpoint(options) { : await options.jsonExtractor(JSON.parse(content)); if (article === undefined) { - logger.warn("no content for link", link); + logger.warn("no content for link", link, "extractor", options.contentExtractor?.name); + logger.debug("content is", content); return undefined; } @@ -202,6 +227,8 @@ module.exports = { linkExtractors: { domAElementLinkExtractor, aLinkExtractor: domAElementLinkExtractor, + fuzzyALinkSelector, + feedXmlLinkExtractor, }, contentExtractors: { jsContentExtractor, diff --git a/lib/utils/http.js b/lib/utils/http.js index 1d4bb1d12bf120..3649227a1a5720 100644 --- a/lib/utils/http.js +++ b/lib/utils/http.js @@ -1,4 +1,5 @@ const config = require("@/config"); +const logger = require("./logger"); function fetchText(url, encoding = "utf-8") { return fetch(url, { @@ -8,7 +9,12 @@ function fetchText(url, encoding = "utf-8") { referer: url, }, }) - .then((res) => res.arrayBuffer()) + .then((res) => { + if (res.status !== 200) { + 
logger.debug("fetch url", url, res.status, "headers", res.headers); + } + return res.arrayBuffer(); + }) .then((buff) => new TextDecoder(encoding).decode(buff)); } @@ -34,8 +40,16 @@ function fetchTextWithCrossWallProxy(url, encoding = "utf-8") { bodyTimeout: 10_000, maxRedirections: 2, }), + headers: { + "User-Agent": config.value.ua, + }, }) - .then((res) => res.arrayBuffer()) + .then((res) => { + if (res.status !== 200) { + logger.debug("fetch url", url, res.status, "headers", res.headers); + } + return res.arrayBuffer(); + }) .then((buff) => new TextDecoder(encoding).decode(buff)); } diff --git a/package-lock.json b/package-lock.json index 6c6b62c118d903..d06e26f809e8e0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,7 @@ "@koa/router": "^12.0.1", "@newdash/newdash": "^5.23.1", "@node-rs/jieba": "^1.10.3", + "@xmldom/xmldom": "^0.8.10", "art-template": "4.13.2", "cheerio": "1.0.0-rc.12", "chrono-node": "^2.7.6", diff --git a/package.json b/package.json index d1b4263ebf0272..d95d2758778c33 100644 --- a/package.json +++ b/package.json @@ -47,6 +47,7 @@ "@koa/router": "^12.0.1", "@newdash/newdash": "^5.23.1", "@node-rs/jieba": "^1.10.3", + "@xmldom/xmldom": "^0.8.10", "art-template": "4.13.2", "cheerio": "1.0.0-rc.12", "chrono-node": "^2.7.6",