Add internalLinks() / externalLinks() and re-index filtered arrays.

* src/phpscraper.php: charset() and cleanParagraphs() wrap array_filter() in
  array_values() so the result is a 0-based list (charset() reads index 0).
  Adds internalLinks() (same host as the current URL) and externalLinks()
  (everything else), both returned as re-indexed lists.
* tests/LinkTest.php: covers both new methods.
* websites/: whitespace fix in the VuePress config, typo fix in the docs.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Leading indentation was reconstructed, blank context lines were derived from
the hunk line counts, and the whitespace-only change in
websites/.vuepress/config.js is a best guess. The index hashes are the ones
from the original patch. Confirm against the repository before applying.

diff --git a/src/phpscraper.php b/src/phpscraper.php
index 79e0b00..b5fc5fc 100644
--- a/src/phpscraper.php
+++ b/src/phpscraper.php
@@ -224,7 +224,7 @@ public function title()
     public function charset()
     {
         // a bit more complex, as I didn't get the XPath working proper...
-        $filteredList = array_filter(
+        $filteredList = array_values(array_filter(
             // 1. Get all attributes "charset"
             $this->filter('//meta')->extract(['charset']),
 
@@ -233,7 +233,7 @@ function ($charset) {
                 return $charset == '';
             },
             ARRAY_FILTER_USE_KEY
-        );
+        ));
 
         return count($filteredList) == 0 ? null : $filteredList[0];
     }
@@ -503,10 +503,10 @@ public function paragraphs()
      */
     public function cleanParagraphs()
     {
-        return array_filter(
+        return array_values(array_filter(
             $this->paragraphs(),
             function($paragraph) { return ($paragraph != ''); }
-        );
+        ));
     }
 
     /**
@@ -560,7 +560,7 @@ public function cleanOutlineWithParagraphs()
     }
 
     /**
-     * get all links on the page with absolute URLs
+     * get all links on the page as absolute URLs
      *
      * @return array
      */
@@ -577,6 +577,37 @@ public function links()
         return $result;
     }
 
+    /**
+     * get all internal links (same host as the current URL) as absolute URLs
+     *
+     * @return array
+     */
+    public function internalLinks()
+    {
+        // get the current host - to compare against for internal links
+        $host = parse_url($this->currentURL(), PHP_URL_HOST);
+
+        // keep only links on the same host and re-index the result as a list
+        return array_values(array_filter(
+            $this->links(),
+            function($link) use ($host) { return ($host === parse_url($link, PHP_URL_HOST)); }
+        ));
+    }
+
+    /**
+     * get all external links (not part of internalLinks()) as absolute URLs
+     *
+     * @return array
+     */
+    public function externalLinks()
+    {
+        // diff the array; array_diff() preserves keys, so re-index it as a list
+        return array_values(array_diff(
+            $this->links(),
+            $this->internalLinks()
+        ));
+    }
+
     /**
      * get all links on the page with commonly interesting details
      *
diff --git a/tests/LinkTest.php b/tests/LinkTest.php
index 9f520cb..f8b86a0 100644
--- a/tests/LinkTest.php
+++ b/tests/LinkTest.php
@@ -246,4 +246,38 @@ public function testBaseHref()
             ]
         ], $web->linksWithDetails);
     }
+
+    /**
+     * @test
+     */
+    public function testInternalLinks()
+    {
+        $web = new \spekulatius\phpscraper();
+
+        // Navigate to the test page.
+        $web->go('https://test-pages.phpscraper.de/links/base-href.html');
+
+        // Check the list of internal links
+        $this->assertSame(
+            ['https://test-pages.phpscraper.de/assets/cat.jpg'],
+            $web->internalLinks
+        );
+    }
+
+    /**
+     * @test
+     */
+    public function testExternalLinks()
+    {
+        $web = new \spekulatius\phpscraper();
+
+        // Navigate to the test page.
+        $web->go('https://test-pages.phpscraper.de/links/base-href.html');
+
+        // Check the list of external links
+        $this->assertSame(
+            ['https://placekitten.com/408/287'],
+            $web->externalLinks
+        );
+    }
 }
diff --git a/websites/.vuepress/config.js b/websites/.vuepress/config.js
index 8284d55..5e658e7 100644
--- a/websites/.vuepress/config.js
+++ b/websites/.vuepress/config.js
@@ -9,7 +9,7 @@ module.exports = {
         serviceWorker: true,
         updatePopup: true
       }
-      ]
+    ]
   ],
   themeConfig: {
     repo: 'spekulatius/phpscraper',
diff --git a/websites/examples/scrape-social-media-meta-tags.md b/websites/examples/scrape-social-media-meta-tags.md
index 273cd8e..3c0c6bf 100644
--- a/websites/examples/scrape-social-media-meta-tags.md
+++ b/websites/examples/scrape-social-media-meta-tags.md
@@ -94,4 +94,4 @@ $data = $web->twitterCard;
 */
 ```
 
-In similar fashion to Open Graph, the array will be empty if not Twitter Card tags have been found.
+In similar fashion to Open Graph, the array will be empty if no Twitter Card tags have been found.