Merge pull request #52 from nicofrand/master

Skip empty (empty innerHTML) nodes when grabbing article
j0k3r · Mar 9, 2021 · 9a490fa · 9a490fa
2 parents 6f6b1f9 + ff78c63
commit 9a490fa
Showing 1 changed file with 8 additions and 2 deletions.
diff --git a/src/Readability.php b/src/Readability.php
@@ -993,6 +993,12 @@ protected function grabArticle(\DOMElement $page = null)
         for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
             $tagName = $node->tagName;
 
+            $nodeContent = $node->getInnerHTML();
+            if (empty($nodeContent)) {
+                $this->logger->debug('Skipping empty node');
+                continue;
+            }
+
             // Some well known site uses sections as paragraphs.
             if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
                 $nodesToScore[] = $node;
@@ -1001,11 +1007,11 @@ protected function grabArticle(\DOMElement $page = null)
             // Turn divs into P tags where they have been used inappropriately
             //  (as in, where they contain no other block level elements).
             if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
-                if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
+                if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
                     $newNode = $this->dom->createElement('p');
 
                     try {
-                        $newNode->setInnerHtml($node->getInnerHTML());
+                        $newNode->setInnerHtml($nodeContent);
 
                         $node->parentNode->replaceChild($newNode, $node);
                         --$nodeIndex;