From ff78c63e6d0b9ff1348d84a491536130ec7c387a Mon Sep 17 00:00:00 2001
From: nicofrand <mail@nicofrand.eu>
Date: Sat, 25 May 2019 16:12:52 +0200
Subject: [PATCH] Skip empty (empty innerHTML) nodes when grabbing article

---
 src/Readability.php | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/Readability.php b/src/Readability.php
index 4a855ca..a8c5519 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -1008,6 +1008,12 @@ protected function grabArticle(\DOMElement $page = null)
         for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
             $tagName = $node->tagName;
 
+            $nodeContent = $node->getInnerHTML();
+            if (empty($nodeContent)) {
+                $this->logger->debug('Skipping empty node');
+                continue;
+            }
+
             // Some well known site uses sections as paragraphs.
             if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
                 $nodesToScore[] = $node;
@@ -1016,11 +1022,11 @@ protected function grabArticle(\DOMElement $page = null)
             // Turn divs into P tags where they have been used inappropriately
             //  (as in, where they contain no other block level elements).
             if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
-                if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
+                if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
                     $newNode = $this->dom->createElement('p');
 
                     try {
-                        $newNode->setInnerHtml($node->getInnerHTML());
+                        $newNode->setInnerHtml($nodeContent);
 
                         $node->parentNode->replaceChild($newNode, $node);
                         --$nodeIndex;