From 82f8e5617ef787b9e3ea77aca16bd98c16770fc6 Mon Sep 17 00:00:00 2001 From: wanderor Date: Sat, 21 Sep 2024 15:43:27 +0800 Subject: [PATCH 1/2] Enhance detection of lazy image This code change: - Enhances detection of lazy image to support Wechat web pages. It also covers herald sun in original test data. - Adds a new overwriteImgSrc option to allow caller to overwrite img src with data-src more aggressively. - Refactors _fixLazyImages() to smaller functions to reduce duplicated and inconsistent logics. --- Readability.js | 161 ++++++++++-------- test/test-pages/herald-sun-1/expected.html | 4 +- .../wechat-image/expected-metadata.json | 9 + test/test-pages/wechat-image/expected.html | 4 + test/test-pages/wechat-image/source.html | 10 ++ 5 files changed, 116 insertions(+), 72 deletions(-) create mode 100644 test/test-pages/wechat-image/expected-metadata.json create mode 100644 test/test-pages/wechat-image/expected.html create mode 100644 test/test-pages/wechat-image/source.html diff --git a/Readability.js b/Readability.js index ccad9710..2181af18 100644 --- a/Readability.js +++ b/Readability.js @@ -64,6 +64,8 @@ function Readability(doc, options) { this._disableJSONLD = !!options.disableJSONLD; this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; this._linkDensityModifier = options.linkDensityModifier || 0; + // If true, will always overwrite img src with found data-src attribute. + this._overwriteImgSrc = !!options.overwriteImgSrc; // Start with all flags set this._flags = @@ -108,6 +110,10 @@ function Readability(doc, options) { } } +// Helper: OR multiple regexps to one. 
+var _combineRegExps = (...regexps) => + new RegExp(regexps.map(regexp => regexp.source).join("|")); + Readability.prototype = { FLAG_STRIP_UNLIKELYS: 0x1, FLAG_WEIGHT_CLASSES: 0x2, @@ -172,6 +178,15 @@ Readability.prototype = { /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu, loadingWords: /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu, + // used to identify img data-src attribute: + imgSrcset: + /\.(jpg|jpeg|png|webp)\s+\d/, + imgSrc: _combineRegExps( + /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/, + /^\s*https?:\/\/\S+=(jpg|jpeg|png|webp)\S*\s*$/), + // used to identify lazy img src (aka placeholder) + lazyImgSrc: + /svg\s+(width|height)=['"]?1(px)?['"]?\s+/ }, UNLIKELY_ROLES: [ @@ -2296,85 +2311,91 @@ Readability.prototype = { } }, + /** + * Look for the first data-src like property. If found, convert image/figure + * element into image that can be loaded without JS, and return true. + * Otherwise return false. + */ + _fixLazyImage(elem, dry_run) { + for (var j = 0; j < elem.attributes.length; j++) { + var attr = elem.attributes[j]; + if ( + attr.name === "src" || + attr.name === "srcset" || + attr.name === "alt" + ) { + continue; + } + var copyTo = null; + if (this.REGEXPS.imgSrcset.test(attr.value)) { + copyTo = "srcset"; + } else if (this.REGEXPS.imgSrc.test(attr.value)) { + copyTo = "src"; + } + if (copyTo) { + if (!dry_run) { + //if this is an img or picture, set the attribute directly + if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { + elem.setAttribute(copyTo, attr.value); + } else if ( + elem.tagName === "FIGURE" && + !this._getAllNodesWithTag(elem, ["img", "picture"]).length + ) { + //if the item is a
that does not contain an image or picture, create + //one and place it inside the figure see the nytimes-3 testcase for an example + var img = this._doc.createElement("img"); + img.setAttribute(copyTo, attr.value); + elem.appendChild(img); + } + } + return true; + } + } + return false; + }, + + /** + * In some sites (e.g. Kotaku, Wechat), they put 1px square image as data uri (base64 + * or not) in the src attribute. So, here we check if the data uri is too short, width + * or hight is 1, just might as well remove it. + */ + _maybeRemoveImgSrc(elem) { + if (!elem.src) return; + + var parts = this.REGEXPS.b64DataUrl.exec(elem.src); + if (parts != null) { // base64 encoded + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. + if (parts[1] === "image/svg+xml") return; + // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) + // it will be too small, therefore it might be placeholder image. + var b64starts = elem.src.search(/base64\s*/i) + 7; + var b64length = elem.src.length - b64starts; + if (b64length >= 133) return; + } else { // not base64 encoded + if (!this.REGEXPS.lazyImgSrc.test(elem.src)) return; + } + + if (this._fixLazyImage(elem, true)) { // src could be removed + elem.removeAttribute("src"); + } + }, + /* convert images and figures that have properties like data-src into images that can be loaded without JS */ _fixLazyImages(root) { this._forEachNode( this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) { - // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. - // So, here we check if the data uri is too short, just might as well remove it. - if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { - // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. 
- var parts = this.REGEXPS.b64DataUrl.exec(elem.src); - if (parts[1] === "image/svg+xml") { - return; - } - - // Make sure this element has other attributes which contains image. - // If it doesn't, then this src is important and shouldn't be removed. - var srcCouldBeRemoved = false; - for (var i = 0; i < elem.attributes.length; i++) { - var attr = elem.attributes[i]; - if (attr.name === "src") { - continue; - } - - if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { - srcCouldBeRemoved = true; - break; - } - } - - // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) - // it will be too small, therefore it might be placeholder image. - if (srcCouldBeRemoved) { - var b64starts = elem.src.search(/base64\s*/i) + 7; - var b64length = elem.src.length - b64starts; - if (b64length < 133) { - elem.removeAttribute("src"); - } - } - } - - // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 - if ( - (elem.src || (elem.srcset && elem.srcset != "null")) && - !elem.className.toLowerCase().includes("lazy") - ) { - return; - } - - for (var j = 0; j < elem.attributes.length; j++) { - attr = elem.attributes[j]; + if (!this._overwriteImgSrc) { // overwrite is conditional, not forced + this._maybeRemoveImgSrc(elem); + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 if ( - attr.name === "src" || - attr.name === "srcset" || - attr.name === "alt" + (elem.src || (elem.srcset && elem.srcset != "null")) && + !elem.className.toLowerCase().includes("lazy") ) { - continue; - } - var copyTo = null; - if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { - copyTo = "srcset"; - } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { - copyTo = "src"; - } - if (copyTo) { - //if this is an img or picture, set the attribute directly - if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { - elem.setAttribute(copyTo, attr.value); - } else if ( - elem.tagName === "FIGURE" && - 
!this._getAllNodesWithTag(elem, ["img", "picture"]).length - ) { - //if the item is a
that does not contain an image or picture, create one and place it inside the figure - //see the nytimes-3 testcase for an example - var img = this._doc.createElement("img"); - img.setAttribute(copyTo, attr.value); - elem.appendChild(img); - } + return; } } + this._fixLazyImage(elem, false); } ); }, diff --git a/test/test-pages/herald-sun-1/expected.html b/test/test-pages/herald-sun-1/expected.html index b5e6ea0e..ac0b4238 100644 --- a/test/test-pages/herald-sun-1/expected.html +++ b/test/test-pages/herald-sun-1/expected.html @@ -1,7 +1,7 @@
-

A new Bill would require telecommunications service providers to store so-called ‘metadat +

A new Bill would require telecommunications service providers to store so-called ‘metadat

A new Bill would require telecommunications service providers to store so-called ‘metadata’ for two years. @@ -30,4 +30,4 @@

LAURIE OAKES IS THE NINE NETWORK POLITICAL EDITOR

-
\ No newline at end of file + diff --git a/test/test-pages/wechat-image/expected-metadata.json b/test/test-pages/wechat-image/expected-metadata.json new file mode 100644 index 00000000..161b678e --- /dev/null +++ b/test/test-pages/wechat-image/expected-metadata.json @@ -0,0 +1,9 @@ +{ + "title": "", + "byline": null, + "dir": null, + "excerpt": "This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.", + "siteName": null, + "publishedTime": null, + "readerable": false +} diff --git a/test/test-pages/wechat-image/expected.html b/test/test-pages/wechat-image/expected.html new file mode 100644 index 00000000..de48aedb --- /dev/null +++ b/test/test-pages/wechat-image/expected.html @@ -0,0 +1,4 @@ +
+

This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.

+

+
diff --git a/test/test-pages/wechat-image/source.html b/test/test-pages/wechat-image/source.html new file mode 100644 index 00000000..24977838 --- /dev/null +++ b/test/test-pages/wechat-image/source.html @@ -0,0 +1,10 @@ + + +

This is a simplified Wechat page. The original page is dynamic and complex that JSDOMParser cannot parse.

+

+ + + From 97d7d6a0a71b528eac127410fcff1ab29ab21c21 Mon Sep 17 00:00:00 2001 From: wanderor Date: Sat, 21 Sep 2024 16:10:44 +0800 Subject: [PATCH 2/2] Fix lint errors --- Readability.js | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Readability.js b/Readability.js index 2181af18..cdeb20db 100644 --- a/Readability.js +++ b/Readability.js @@ -2360,19 +2360,25 @@ Readability.prototype = { * or hight is 1, just might as well remove it. */ _maybeRemoveImgSrc(elem) { - if (!elem.src) return; + if (!elem.src) { + return; + } var parts = this.REGEXPS.b64DataUrl.exec(elem.src); if (parts != null) { // base64 encoded // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. - if (parts[1] === "image/svg+xml") return; + if (parts[1] === "image/svg+xml") { + return; + } // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) // it will be too small, therefore it might be placeholder image. var b64starts = elem.src.search(/base64\s*/i) + 7; var b64length = elem.src.length - b64starts; - if (b64length >= 133) return; - } else { // not base64 encoded - if (!this.REGEXPS.lazyImgSrc.test(elem.src)) return; + if (b64length >= 133) { + return; + } + } else if (!this.REGEXPS.lazyImgSrc.test(elem.src)) { + return; } if (this._fixLazyImage(elem, true)) { // src could be removed