Skip to content

Commit

Permalink
MINOR: Adding methods internalLinks and externalLinks in (Fixes #4)
Browse files Browse the repository at this point in the history
  • Loading branch information
spekulatius committed May 12, 2020
1 parent 9feb37a commit 193f422
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 7 deletions.
41 changes: 36 additions & 5 deletions src/phpscraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ public function title()
public function charset()
{
// a bit more complex, as I didn't get the XPath working properly...
$filteredList = array_filter(
$filteredList = array_values(array_filter(
// 1. Get all attributes "charset"
$this->filter('//meta')->extract(['charset']),

Expand All @@ -233,7 +233,7 @@ function ($charset) {
return $charset == '';
},
ARRAY_FILTER_USE_KEY
);
));

return count($filteredList) == 0 ? null : $filteredList[0];
}
Expand Down Expand Up @@ -503,10 +503,10 @@ public function paragraphs()
*/
public function cleanParagraphs()
{
return array_filter(
return array_values(array_filter(
$this->paragraphs(),
function($paragraph) { return ($paragraph != ''); }
);
));
}

/**
Expand Down Expand Up @@ -560,7 +560,7 @@ public function cleanOutlineWithParagraphs()
}

/**
* get all links on the page with absolute URLs
* get all links on the page as absolute URLs
*
* @return array
*/
Expand All @@ -577,6 +577,37 @@ public function links()
return $result;
}

/**
 * get all internal links on the page as absolute URLs
 *
 * A link counts as internal when its host is identical to the host of the current URL.
 *
 * @return array
 */
public function internalLinks()
{
    // get the current host - to compare against for internal links
    $host = parse_url($this->currentURL(), PHP_URL_HOST);

    // keep only the links on that host; array_values() re-indexes what array_filter() leaves behind
    return array_values(array_filter(
        $this->links(),
        // $host is only read inside the closure, so it is captured by value
        function ($link) use ($host) {
            return $host === parse_url($link, PHP_URL_HOST);
        }
    ));
}

/**
 * get all external links on the page as absolute URLs
 *
 * A link counts as external when it is not contained in internalLinks().
 *
 * @return array
 */
public function externalLinks()
{
    // array_diff() preserves the keys of its first argument, so the result is
    // re-indexed to return a plain list - consistent with internalLinks()
    return array_values(array_diff(
        $this->links(),
        $this->internalLinks()
    ));
}

/**
* get all links on the page with commonly interesting details
*
Expand Down
34 changes: 34 additions & 0 deletions tests/LinkTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -246,4 +246,38 @@ public function testBaseHref()
]
], $web->linksWithDetails);
}

/**
* @test
*/
public function testInternalLinks()
{
    // Load the test page.
    $scraper = new \spekulatius\phpscraper();
    $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');

    // Only this link is expected in the internal list.
    $expected = ['https://test-pages.phpscraper.de/assets/cat.jpg'];

    $this->assertSame($expected, $scraper->internalLinks);
}

/**
* @test
*/
public function testExternalLinks()
{
    // Load the test page.
    $scraper = new \spekulatius\phpscraper();
    $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');

    // Only this link is expected in the external list.
    $expected = ['https://placekitten.com/408/287'];

    $this->assertSame($expected, $scraper->externalLinks);
}
}
2 changes: 1 addition & 1 deletion websites/.vuepress/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module.exports = {
serviceWorker: true,
updatePopup: true
}
]
]
],
themeConfig: {
repo: 'spekulatius/phpscraper',
Expand Down
2 changes: 1 addition & 1 deletion websites/examples/scrape-social-media-meta-tags.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,4 @@ $data = $web->twitterCard;
*/
```

In similar fashion to Open Graph, the array will be empty if not Twitter Card tags have been found.
In similar fashion to Open Graph, the array will be empty if no Twitter Card tags have been found.

0 comments on commit 193f422

Please sign in to comment.