Skip to content

Commit

Permalink
Merge pull request #193 from j0k3r/feature/page-contains-next-page-link
Browse files Browse the repository at this point in the history
Handle "if_page_contains" for "next_page_link"
  • Loading branch information
j0k3r authored Jan 22, 2019
2 parents fb5192c + 1b271a4 commit c9e85d2
Show file tree
Hide file tree
Showing 7 changed files with 4,986 additions and 11 deletions.
12 changes: 12 additions & 0 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,18 @@ public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy =
// try to get next page link
// @todo: should we test if the link is actually a link?
foreach ($this->siteConfig->next_page_link as $pattern) {
// Do we have conditions?
$condition = $this->siteConfig->getIfPageContainsCondition('next_page_link', $pattern);

if ($condition) {
$elems = $this->xpath->evaluate($condition, $this->readability->dom);

// move on to next next_page_link XPath in case condition isn't met
if (!($elems instanceof \DOMNodeList && $elems->length > 0)) {
continue;
}
}

$elems = $this->xpath->evaluate($pattern, $this->readability->dom);

if (\is_string($elems)) {
Expand Down
4 changes: 2 additions & 2 deletions src/Graby.php
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ private function doFetchContent($url)

$res['html'] = $this->cleanupHtml($contentBlock, $effectiveUrl);

$this->logger->info('Returning data (most interesting ones): {data}', ['data' => ($res + ['html' => \strlen($res['html'])])]);
$this->logger->info('Returning data (most interesting ones): {data}', ['data' => ['html' => '(only length for debug): ' . \strlen($res['html'])] + $res]);

return $res;
}
Expand Down Expand Up @@ -681,7 +681,7 @@ private function getSinglePage($html, $url)
if ($condition) {
$elems = $xpath->evaluate($condition, $readability->dom);

// move on to next single page link XPath in case condition isn't met
// move on to next single_page_link XPath in case condition isn't met
if (!($elems instanceof \DOMNodeList && $elems->length > 0)) {
continue;
}
Expand Down
25 changes: 19 additions & 6 deletions src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -377,17 +377,30 @@ public function parseLines(array $lines)
return $config;
}

// Add if_page_page_contains
// TODO: Expand so it can be used with other rules too
/**
* Attach an `if_page_contains` XPath condition to a previously defined rule. Supported rules:
* - single_page_link.
* - next_page_link.
*
* single_page_link has priority: when it is not empty, the condition is always attached to it,
* even if next_page_link is defined too. next_page_link is only considered when
* single_page_link is empty. When both are empty, nothing is stored.
*
* The condition is stored in `$config->if_page_contains[<rule>][<value>]`, where `<value>` is
* the last value currently defined for the selected rule.
*
* @param SiteConfig $config Current config
* @param string $condition XPath condition
*/
private function handleIfPageContainsCondition(SiteConfig $config, $condition)
{
if (empty($config->single_page_link)) {
if (!empty($config->single_page_link)) {
$rule = 'single_page_link';
} elseif (!empty($config->next_page_link)) {
$rule = 'next_page_link';
} else {
// no link found, we can't apply "if_page_contains"
return;
}

// end() returns the last value but also moves the array's internal pointer,
// so reset() is called right after to rewind it
$key = end($config->single_page_link);
reset($config->single_page_link);
$key = end($config->$rule);
reset($config->$rule);

// the last value of the rule is used as the key under which the condition is stored
$config->if_page_contains['single_page_link'][$key] = (string) $condition;
$config->if_page_contains[$rule][$key] = (string) $condition;
}
}
4 changes: 2 additions & 2 deletions src/SiteConfig/SiteConfig.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class SiteConfig
public $test_url = [];

// If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not.
// Currently only works with single_page_link.
// Currently only works with single_page_link & next_page_link (first one has priority over the second one).
public $if_page_contains = [];

// Single-page link - should identify a link element or URL pointing to the page holding the entire article
Expand Down Expand Up @@ -215,7 +215,7 @@ public function autodetect_on_failure($use_default = true)
/**
* Return a condition for the given name (if exists).
*
* @param string $name Rule name (only single_page_link is supported for now)
* @param string $name Rule name (only single_page_link & next_page_link are supported for now)
* @param string $value Value of the rule (currently only an url)
*
* @return string|null
Expand Down
39 changes: 38 additions & 1 deletion tests/GrabyTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1637,7 +1637,7 @@ public function testAuthors($url, $file, $expectedAuthors)
/**
* Validated using the site_config in "tests/fixtures".
*/
public function testIfPageContains()
public function testIfPageContainsWithSinglePageLink()
{
$graby = $this->getGrabyWithMock(
'/fixtures/content/timothysykes-keepol.html',
Expand Down Expand Up @@ -1670,6 +1670,43 @@ public function testIfPageContains()
$this->assertSame(200, $res['status']);
}

/**
 * Fetch the Rolling Stone fixture with the site_config from "tests/fixtures"
 * and check that the result has the expected shape and status.
 */
public function testIfPageContainsWithNextPageLink()
{
    $options = [
        'debug' => true,
        'extractor' => [
            'config_builder' => [
                'site_config' => [__DIR__ . '/fixtures/site_config'],
            ],
        ],
    ];
    $graby = $this->getGrabyWithMock('/fixtures/content/rollingstone.html', 200, $options);

    $res = $graby->fetchContent('https://www.rollingstone.com/?redirurl=/politics/news/greed-and-debt-the-true-story-of-mitt-romney-and-bain-capital-20120829');

    // exactly these twelve entries are expected, nothing more
    $this->assertCount(12, $res);

    $expectedKeys = [
        'status',
        'html',
        'title',
        'language',
        'date',
        'authors',
        'url',
        'content_type',
        'summary',
        'open_graph',
        'native_ad',
        'all_headers',
    ];

    foreach ($expectedKeys as $expectedKey) {
        $this->assertArrayHasKey($expectedKey, $res);
    }

    $this->assertSame(200, $res['status']);
}

/**
* Return an instance of graby with a mocked Guzzle client returning data from a predefined file.
*/
Expand Down
Loading

0 comments on commit c9e85d2

Please sign in to comment.