From ff78c63e6d0b9ff1348d84a491536130ec7c387a Mon Sep 17 00:00:00 2001 From: nicofrand Date: Sat, 25 May 2019 16:12:52 +0200 Subject: [PATCH] Skip empty (empty innerHTML) nodes when grabbing article --- src/Readability.php | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 4a855ca..a8c5519 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1008,6 +1008,12 @@ protected function grabArticle(\DOMElement $page = null) for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) { $tagName = $node->tagName; + $nodeContent = $node->getInnerHTML(); + if (empty($nodeContent)) { + $this->logger->debug('Skipping empty node'); + continue; + } + // Some well known site uses sections as paragraphs. if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) { $nodesToScore[] = $node; @@ -1016,11 +1022,11 @@ protected function grabArticle(\DOMElement $page = null) // Turn divs into P tags where they have been used inappropriately // (as in, where they contain no other block level elements). if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) { - if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) { + if (!preg_match($this->regexps['divToPElements'], $nodeContent)) { $newNode = $this->dom->createElement('p'); try { - $newNode->setInnerHtml($node->getInnerHTML()); + $newNode->setInnerHtml($nodeContent); $node->parentNode->replaceChild($newNode, $node); --$nodeIndex;