Skip to content

Commit

Permalink
Merge pull request #52 from nicofrand/master
Browse files: browse the repository at this point in the history
Skip empty (empty innerHTML) nodes when grabbing article
  • Loading branch information
j0k3r authored Mar 9, 2021
2 parents 6f6b1f9 + ff78c63 commit 9a490fa
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/Readability.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -993,6 +993,12 @@ protected function grabArticle(\DOMElement $page = null)
  for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); ++$nodeIndex) {
  $tagName = $node->tagName;

+ $nodeContent = $node->getInnerHTML();
+ if (empty($nodeContent)) {
+ $this->logger->debug('Skipping empty node');
+ continue;
+ }
+
  // Some well known site uses sections as paragraphs.
  if (0 === strcasecmp($tagName, 'p') || 0 === strcasecmp($tagName, 'td') || 0 === strcasecmp($tagName, 'pre') || 0 === strcasecmp($tagName, 'section')) {
  $nodesToScore[] = $node;
Expand All @@ -1001,11 +1007,11 @@ protected function grabArticle(\DOMElement $page = null)
  // Turn divs into P tags where they have been used inappropriately
  // (as in, where they contain no other block level elements).
  if (0 === strcasecmp($tagName, 'div') || 0 === strcasecmp($tagName, 'article') || 0 === strcasecmp($tagName, 'section')) {
- if (!preg_match($this->regexps['divToPElements'], $node->getInnerHTML())) {
+ if (!preg_match($this->regexps['divToPElements'], $nodeContent)) {
  $newNode = $this->dom->createElement('p');

  try {
- $newNode->setInnerHtml($node->getInnerHTML());
+ $newNode->setInnerHtml($nodeContent);

  $node->parentNode->replaceChild($newNode, $node);
  --$nodeIndex;
Expand Down

0 comments on commit 9a490fa

Please sign in to comment.