Skip to content

Commit

Permalink
Skip empty (empty innerHTML) nodes when grabbing article
Browse files Browse the repository at this point in the history
  • Loading branch information
nicofrand committed May 25, 2019
1 parent f808c1b commit ff78c63
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/Readability.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -1008,6 +1008,12 @@ protected function grabArticle(\DOMElement $page = null)
for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
$tagName = $node->tagName;

+ $nodeContent = $node->getInnerHTML();
+ if (empty($nodeContent)) {
+ $this->logger->debug('Skipping empty node');
+ continue;
+ }
+
// Some well known site uses sections as paragraphs.
if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
$nodesToScore[] = $node;
Expand All @@ -1016,11 +1022,11 @@ protected function grabArticle(\DOMElement $page = null)
// Turn divs into P tags where they have been used inappropriately
// (as in, where they contain no other block level elements).
if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
- if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
+ if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
$newNode = $this->dom->createElement('p');

try {
- $newNode->setInnerHtml($node->getInnerHTML());
+ $newNode->setInnerHtml($nodeContent);

$node->parentNode->replaceChild($newNode, $node);
--$nodeIndex;
Expand Down

0 comments on commit ff78c63

Please sign in to comment.