From 58fe208f612d4d96c99d559bcb1a9390ab5606af Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 10 Nov 2022 10:57:18 +0100 Subject: [PATCH 01/24] Allow to process XML and JSON directly --- src/UsesParsers.php | 80 ++++++++++++++++-- tests/ParserTest.php | 190 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+), 8 deletions(-) create mode 100644 tests/ParserTest.php diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 25fc439..018b688 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -4,25 +4,89 @@ trait UsesParsers { - public function parseJson(?string $jsonString = null): array + /** + * Parses a given JSON string or fetches the URL and parses it. + * + * @param ?string $jsonStringOrUrl + * @return array $data + */ + public function parseJson(?string $jsonStringOrUrl = null): array { - // See if we can parse the current URL already. If not, navigate to the usual URL. try { - return json_decode($jsonString, true); + // If we have a string, let's try to parse the JSON from this. + if ($jsonStringOrUrl !== null) { + // Simple: Try to parse what we have been given + try { + $result = json_decode($jsonStringOrUrl, true); + } catch (\Exception $e) { + // We don't do anything if it fails - likely we have an URL. Let's continue below. + } + } + + /** + * We fetch the content and process it, if we haven't got a JSON as a string. + * + * This is a work-around to allow for: + * + * - `$web->parseJson('https://...')`. + * - `$web->go('...')->parseJson()`. + */ + $result = $result ?? json_decode( + // Fetch the resource either using $jsonStringOrUrl + $this->fetchAsset( + // Fallback on the current URL, if needed and possible (`go` was used before). + $jsonStringOrUrl || !$this->currentPage ? $jsonStringOrUrl : $this->currentUrl() + ), + true + ); } catch (\Exception $e) { throw new \Exception('Failed to parse JSON: ' . $e->getMessage()); } + + return $result; } - public function parseXml(?string $xmlString = null): array + /** + * Parses a given XML string or fetches the URL and parses it. + * + * @param ?string $xmlStringOrUrl + * @return array $data + */ + public function parseXML(?string $xmlStringOrUrl = null): array { try { - $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); - $json = json_encode($xml); + // Try to parse the XML. If it works we have got an XML string. + if ($xmlStringOrUrl !== null) { + try { + $result = $this->parseXmlString($xmlStringOrUrl); + } catch (\Exception $e) { + // Do nothing, we just want to try it if it works. + } + } - return json_decode($json, true); + /** + * We fetch the content and process it, if we haven't got a XML as a string. + * + * This is a work-around to allow for: + * + * - `$web->parseXml('https://...')`. + * - `$web->go('...')->parseXml()`. + */ + $result = $result ?? $this->parseXmlString($this->fetchAsset( + $xmlStringOrUrl || !$this->currentPage ? $xmlStringOrUrl : $this->currentUrl() + )); } catch (\Exception $e) { throw new \Exception('Failed to parse XML: ' . $e->getMessage()); } + + return $result; + } + + protected function parseXmlString(string $xmlString): array + { + // XML parser + $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); + + return json_decode(json_encode($xml), true); } -} \ No newline at end of file +} diff --git a/tests/ParserTest.php b/tests/ParserTest.php new file mode 100644 index 0000000..f33edf7 --- /dev/null +++ b/tests/ParserTest.php @@ -0,0 +1,190 @@ +expectException(\Symfony\Component\HttpClient\Exception\ClientException::class); + $this->expectExceptionMessage('HTTP/2 404 returned for "https://phpscraper.de/broken-url"'); + + $web->fetchAsset('https://phpscraper.de/broken-url'); + } + + /** + * Test the various ways to call `parseJson()`. + * + * @test + */ + public function testDifferentJsonCalls() + { + // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). + $web = new \spekulatius\phpscraper; + + // For the reference we are using a simple JSON and parse it. + $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json'); + $jsonData = json_decode($jsonString, true); + + + // Case 1: Passing in an JSON string in. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Parse the $jsonString directly. + (new \spekulatius\phpscraper) + ->parseJson($jsonString) + ); + + + // Case 2: `go` + `parseJson()` + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Chained call using a JSON file as URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/index.json') + ->parseJson() + ); + + + // Case 3: `parseJson()` with absolute URL. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Pass the absolutely URL to `parseJson()` + (new \spekulatius\phpscraper) + ->parseJson('https://test-pages.phpscraper.de/index.json') + ); + + + // Case 4: `go` + `parseJson()` with relative URL. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseJson('/index.json') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/index.json', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/index.json') + ->currentUrl() + ); + + // 5.2. Ensure the parsed JSON is correct. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/index.json') + ->parseJson() + ); + } + + /** + * @test + */ + public function testDifferentXmlCalls() + { + // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). + $web = new \spekulatius\phpscraper; + + // For the reference we are using a simple XML and parse it. + $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml'); + $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); + $xmlData = json_decode(json_encode($xml), true); + + + // Case 1: Passing in an XML string in. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Parse the XML string directly. + (new \spekulatius\phpscraper) + ->parseXml($xmlString) + ); + + + // Case 2: `go` + `parseXml()` + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Chained call with XML as URL + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/sitemap.xml') + ->parseXml() + ); + + + // Case 3: `parseXml()` with absolute URL. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Pass the absolutely URL to `parseXml()` + (new \spekulatius\phpscraper) + ->parseXml('https://test-pages.phpscraper.de/sitemap.xml') + ); + + + // Case 4: `go` + `parseXml()` with relative URL. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseXml('/sitemap.xml') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/sitemap.xml', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/sitemap.xml') + ->currentUrl() + ); + + // 5.2. Ensure the parsed JSON is correct. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/sitemap.xml') + ->parseXml() + ); + } +} From 40c3bb18b932d3b3ac8bfab13951f4ad9c86d241 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 10 Nov 2022 11:07:43 +0100 Subject: [PATCH 02/24] Move `testNullPassingThrough` to the top --- tests/UrlTest.php | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/UrlTest.php b/tests/UrlTest.php index d46e6d3..fd2c3b9 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -8,6 +8,18 @@ class UrlTest extends \PHPUnit\Framework\TestCase { + /** + * If null is passed to `makeUrlAbsolute`, it should always return null. + * + * @test + */ + public function testNullPassingThrough() + { + $web = new \spekulatius\phpscraper; + + $this->assertNull($web->makeUrlAbsolute(null)); + } + /** * @test */ @@ -37,18 +49,6 @@ public function validateUriTest() ); } - /** - * If null is passed to `makeUrlAbsolute`, it should always return null. - * - * @test - */ - public function testNullPassingThrough() - { - $web = new \spekulatius\phpscraper; - - $this->assertNull($web->makeUrlAbsolute(null)); - } - /** * @test */ From a120c963b2b1cefc075965fd22d47737de5d3700 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 10 Nov 2022 11:42:15 +0100 Subject: [PATCH 03/24] Improve test coverage by ensuring baseHref is considered for `makeUrlAbsolute` --- tests/UrlTest.php | 58 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/tests/UrlTest.php b/tests/UrlTest.php index fd2c3b9..6b2ae3b 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -112,7 +112,63 @@ public function testMakeUrlAbsolute() } /** - * Test if passed in hosts are considered. + * Basic processing of the URLs. + * + * @test + */ + public function testMakeUrlAbsoluteConsiderBaseHref() + { + $web = new \spekulatius\phpscraper; + + /** + * Navigate to test page: This sets the base URL. + * + * It contains: + * + * ```html + * + * ``` + * + * While it's located on `test-pages.phpscraper.de`. + * + * This page isn't actually used. It's purely to set the context. + */ + $web->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html'); + + // Test variations of paths to be processed + // With leading slash + $this->assertSame( + 'https://test-pages-with-base-href.phpscraper.de/index.html', + $web->makeUrlAbsolute('/index.html'), + ); + + // Without leading slash + $this->assertSame( + 'https://test-pages-with-base-href.phpscraper.de/index.html', + $web->makeUrlAbsolute('index.html'), + ); + + // Paths are considered. + $this->assertSame( + 'https://test-pages-with-base-href.phpscraper.de/test/index.html', + $web->makeUrlAbsolute('test/index.html'), + ); + + // Absolutely URLs are untouched. + $this->assertSame( + 'https://example.com/index.html', + $web->makeUrlAbsolute('https://example.com/index.html'), + ); + + // Protocol is considered + $this->assertSame( + 'http://example.com/index.html', + $web->makeUrlAbsolute('http://example.com/index.html'), + ); + } + + /** + * Test if passed in hosts are considered. It trumps any base-href and current url. * * @test */ From 6cf3489519eb29f3c7c23f54741e2147decfbbc0 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 10 Nov 2022 11:48:20 +0100 Subject: [PATCH 04/24] Improve test logic --- tests/UrlTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/UrlTest.php b/tests/UrlTest.php index 6b2ae3b..52c037d 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -201,13 +201,13 @@ public function testMakeUrlAbsoluteWithBaseHost() // Absolutely URLs are untouched. $this->assertSame( 'https://example.com/index.html', - $web->makeUrlAbsolute('https://example.com/index.html', 'https://example.com/test/with/path'), + $web->makeUrlAbsolute('https://example.com/index.html', 'https://example-2.com/test/with/path'), ); // Protocol is considered $this->assertSame( 'http://example.com/index.html', - $web->makeUrlAbsolute('http://example.com/index.html', 'https://example.com/test/with/path'), + $web->makeUrlAbsolute('http://example.com/index.html', 'https://example-2.com/test/with/path'), ); } } From ef6f13ebc007b4dd66f9fa7e6ac572442c0e8b60 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Thu, 10 Nov 2022 16:17:13 +0100 Subject: [PATCH 05/24] Tidy up core --- src/Core.php | 12 ++++++------ websites/examples/scrape-feeds.md | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Core.php b/src/Core.php index 438c16a..586255b 100644 --- a/src/Core.php +++ b/src/Core.php @@ -3,16 +3,11 @@ namespace spekulatius; /** - * This class organizes mostly. For individual functionality please check the related traits. + * This class organizes mostly. For individual functionality check the related traits please. */ class Core { - /** - * Shared simple parsers. - */ - use UsesParsers; - /** * Url related helpers. */ @@ -33,6 +28,11 @@ class Core */ use UsesContent; + /** + * Shared simple parsers for XML, JSON and CSV. + */ + use UsesParsers; + /** * This contains the feeds-related selectors and parsers. */ diff --git a/websites/examples/scrape-feeds.md b/websites/examples/scrape-feeds.md index a80571f..9d3305d 100644 --- a/websites/examples/scrape-feeds.md +++ b/websites/examples/scrape-feeds.md @@ -11,7 +11,7 @@ PHPScraper can identify and process feeds (RSS feeds, sitemaps, etc.) for you. T ## Identify RSS Feed URLs -Websites can define RSS feeds in the head section of their markup. PHPScraper allows to identify the RSS feeds of the current page using `rssUrls`: +Websites can define RSS feeds in the head section of their markup. PHPScraper allows to identify any RSS feeds of the current page using `rssUrls`: ```php $web = new \spekulatius\phpscraper; From a02e7e84001e2b9b4d1cfe47fcc61e270045935a Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sat, 12 Nov 2022 14:58:25 +0100 Subject: [PATCH 06/24] Split test files --- src/UsesParsers.php | 8 ++ tests/ParserJsonTest.php | 92 +++++++++++++++++++ tests/ParserTest.php | 190 --------------------------------------- tests/ParserXmlTest.php | 91 +++++++++++++++++++ 4 files changed, 191 insertions(+), 190 deletions(-) create mode 100644 tests/ParserJsonTest.php delete mode 100644 tests/ParserTest.php create mode 100644 tests/ParserXmlTest.php diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 018b688..e8958a7 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -82,6 +82,14 @@ public function parseXML(?string $xmlStringOrUrl = null): array return $result; } + /** + * Parses a given XML string. + * + * @source https://stackoverflow.com/a/20431742 + * + * @param string $xmlStringOrUrl + * @return array $data + */ protected function parseXmlString(string $xmlString): array { // XML parser diff --git a/tests/ParserJsonTest.php b/tests/ParserJsonTest.php new file mode 100644 index 0000000..d3392ff --- /dev/null +++ b/tests/ParserJsonTest.php @@ -0,0 +1,92 @@ +fetchAsset('https://test-pages.phpscraper.de/index.json'); + $jsonData = json_decode($jsonString, true); + + + // Case 1: Passing in an JSON string in. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Parse the $jsonString directly. + (new \spekulatius\phpscraper) + ->parseJson($jsonString) + ); + + + // Case 2: `go` + `parseJson()` + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Chained call using a JSON file as URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/index.json') + ->parseJson() + ); + + + // Case 3: `parseJson()` with absolute URL. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // Pass the absolutely URL to `parseJson()` + (new \spekulatius\phpscraper) + ->parseJson('https://test-pages.phpscraper.de/index.json') + ); + + + // Case 4: `go` + `parseJson()` with relative URL. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseJson('/index.json') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/index.json', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/index.json') + ->currentUrl() + ); + + // 5.2. Ensure the parsed JSON is correct. + $this->assertSame( + // Pass the JSON Data as reference in. + $jsonData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/index.json') + ->parseJson() + ); + } +} diff --git a/tests/ParserTest.php b/tests/ParserTest.php deleted file mode 100644 index f33edf7..0000000 --- a/tests/ParserTest.php +++ /dev/null @@ -1,190 +0,0 @@ -expectException(\Symfony\Component\HttpClient\Exception\ClientException::class); - $this->expectExceptionMessage('HTTP/2 404 returned for "https://phpscraper.de/broken-url"'); - - $web->fetchAsset('https://phpscraper.de/broken-url'); - } - - /** - * Test the various ways to call `parseJson()`. - * - * @test - */ - public function testDifferentJsonCalls() - { - // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). - $web = new \spekulatius\phpscraper; - - // For the reference we are using a simple JSON and parse it. - $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json'); - $jsonData = json_decode($jsonString, true); - - - // Case 1: Passing in an JSON string in. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Parse the $jsonString directly. - (new \spekulatius\phpscraper) - ->parseJson($jsonString) - ); - - - // Case 2: `go` + `parseJson()` - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Chained call using a JSON file as URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/index.json') - ->parseJson() - ); - - - // Case 3: `parseJson()` with absolute URL. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Pass the absolutely URL to `parseJson()` - (new \spekulatius\phpscraper) - ->parseJson('https://test-pages.phpscraper.de/index.json') - ); - - - // Case 4: `go` + `parseJson()` with relative URL. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // The 'go' sets the base URL for the following relative path. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseJson('/index.json') - ); - - - // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`. - // 5.1. Ensure the final URL is correct. - $this->assertSame( - 'https://test-pages.phpscraper.de/index.json', - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/index.json') - ->currentUrl() - ); - - // 5.2. Ensure the parsed JSON is correct. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/index.json') - ->parseJson() - ); - } - - /** - * @test - */ - public function testDifferentXmlCalls() - { - // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). - $web = new \spekulatius\phpscraper; - - // For the reference we are using a simple XML and parse it. - $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml'); - $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); - $xmlData = json_decode(json_encode($xml), true); - - - // Case 1: Passing in an XML string in. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Parse the XML string directly. - (new \spekulatius\phpscraper) - ->parseXml($xmlString) - ); - - - // Case 2: `go` + `parseXml()` - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Chained call with XML as URL - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/sitemap.xml') - ->parseXml() - ); - - - // Case 3: `parseXml()` with absolute URL. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Pass the absolutely URL to `parseXml()` - (new \spekulatius\phpscraper) - ->parseXml('https://test-pages.phpscraper.de/sitemap.xml') - ); - - - // Case 4: `go` + `parseXml()` with relative URL. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // The 'go' sets the base URL for the following relative path. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseXml('/sitemap.xml') - ); - - - // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`. - // 5.1. Ensure the final URL is correct. - $this->assertSame( - 'https://test-pages.phpscraper.de/sitemap.xml', - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/sitemap.xml') - ->currentUrl() - ); - - // 5.2. Ensure the parsed JSON is correct. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/sitemap.xml') - ->parseXml() - ); - } -} diff --git a/tests/ParserXmlTest.php b/tests/ParserXmlTest.php new file mode 100644 index 0000000..0402046 --- /dev/null +++ b/tests/ParserXmlTest.php @@ -0,0 +1,91 @@ +fetchAsset('https://test-pages.phpscraper.de/sitemap.xml'); + $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); + $xmlData = json_decode(json_encode($xml), true); + + + // Case 1: Passing in an XML string in. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Parse the XML string directly. + (new \spekulatius\phpscraper) + ->parseXml($xmlString) + ); + + + // Case 2: `go` + `parseXml()` + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Chained call with XML as URL + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/sitemap.xml') + ->parseXml() + ); + + + // Case 3: `parseXml()` with absolute URL. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // Pass the absolutely URL to `parseXml()` + (new \spekulatius\phpscraper) + ->parseXml('https://test-pages.phpscraper.de/sitemap.xml') + ); + + + // Case 4: `go` + `parseXml()` with relative URL. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseXml('/sitemap.xml') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/sitemap.xml', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/sitemap.xml') + ->currentUrl() + ); + + // 5.2. Ensure the parsed JSON is correct. + $this->assertSame( + // Pass the XML Data as reference in. + $xmlData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/sitemap.xml') + ->parseXml() + ); + } +} From d5ab34aa5fc57878bb3e87db6a83aade7de376b9 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sat, 12 Nov 2022 16:14:30 +0100 Subject: [PATCH 07/24] wip --- src/Core.php | 10 ++-- src/UsesParsers.php | 137 ++++++++++++++++++++++++++++++++++++++++++- src/phpscraper.php | 10 ++++ tests/CoreTest.php | 45 +++++++++++--- tests/ParserTest.php | 52 ++++++++++++++++ 5 files changed, 238 insertions(+), 16 deletions(-) diff --git a/src/Core.php b/src/Core.php index 586255b..385f2b2 100644 --- a/src/Core.php +++ b/src/Core.php @@ -9,22 +9,22 @@ class Core { /** - * Url related helpers. + * Url related helpers for information about the current location and URL processing. */ use UsesUrls; /** - * This trait manages Goutte itself. + * This trait manages the interaction with Goutte. */ use UsesGoutte; /** - * This contains the basic filter methods. + * This contains the basic filter methods. Make accessing data easier. */ use UsesXPathFilters; /** - * This contains various content-related selectors. + * This contains various content-related selectors. meta tags, h1, etc. pp. */ use UsesContent; @@ -34,7 +34,7 @@ class Core use UsesParsers; /** - * This contains the feeds-related selectors and parsers. + * This contains the feeds-related selectors and parsers: RSS, sitemap, search index, etc. */ use UsesFeeds; } diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 018b688..fddacb1 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -4,6 +4,137 @@ trait UsesParsers { + /** + * Parse a CSV. + * + * @return array $data + */ + public function parseCsv(): array + { + + } + + /** + * Parse a CSV. + * + * @return array $data + */ + public function parseCsv1(): array + { + + } + + /** + * Parse a CSV. + * + * @return array $data + */ + public function parseCsv2(): array + { + + } + + + /** + * Parse a CSV. + * + * @return array $data + */ + public function parseCsv3(): array + { + + } + + + + public function csvDecodeRaw() + { + try { + + // @todo implement + // string $separator = ",", + // string $enclosure = "\"", + // string $escape = "\\" + + $array = array_map('str_getcsv', explode("\n", $csvString)); + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); + } + + return $array; + } + + public function csvDecodeWithHeaderRaw( + string $csvString, + ?array $options = [] + ): array + { + // merge options with the configuration and the global defaults + $config = array_merge( + $this->config + + ); + + $array = []; + + try { + + // @todo implement + // string $separator = ",", + // string $enclosure = "\"", + // string $escape = "\\" + + $array = array_map('str_getcsv', explode("\n", $csvString)); + + $header = array_shift($array); + + array_walk( + $array, + function(&$row, $key, $header) { + $row = array_combine($header, $row); + }, + $header + ); + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); + } + + return $array; + } + + public function csvDecodeWithHeader( + string $csvString, + ?array $options = [] + ): array + { + $csv = $this->csvDecodeWithHeaderRaw($csvString, $options); + + // Cast some common types? + if ($config['enableCastTyping']) { + // + + // Custom types? Callbacks anyone? + foreach ($config['customTypes'] as $field => $callback) { + // check if the field matches. + $entry = $callback($entry); + } + } + + return $csv; + } + + + + + // // Associate + // $csv = array_map('str_getcsv', file($file)); + // array_walk($csv, function(&$a) use ($csv) { + // $a = array_combine($csv[0], $a); + // }); + // array_shift($csv); + + + /** * Parses a given JSON string or fetches the URL and parses it. * @@ -58,7 +189,7 @@ public function parseXML(?string $xmlStringOrUrl = null): array // Try to parse the XML. If it works we have got an XML string. if ($xmlStringOrUrl !== null) { try { - $result = $this->parseXmlString($xmlStringOrUrl); + $result = $this->xmlDecode($xmlStringOrUrl); } catch (\Exception $e) { // Do nothing, we just want to try it if it works. } @@ -72,7 +203,7 @@ public function parseXML(?string $xmlStringOrUrl = null): array * - `$web->parseXml('https://...')`. * - `$web->go('...')->parseXml()`. */ - $result = $result ?? $this->parseXmlString($this->fetchAsset( + $result = $result ?? $this->xmlDecode($this->fetchAsset( $xmlStringOrUrl || !$this->currentPage ? $xmlStringOrUrl : $this->currentUrl() )); } catch (\Exception $e) { @@ -82,7 +213,7 @@ public function parseXML(?string $xmlStringOrUrl = null): array return $result; } - protected function parseXmlString(string $xmlString): array + protected function xmlDecode(string $xmlString): array { // XML parser $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); diff --git a/src/phpscraper.php b/src/phpscraper.php index 2ff81be..3a419f8 100644 --- a/src/phpscraper.php +++ b/src/phpscraper.php @@ -36,6 +36,16 @@ public function __construct(?array $config = []) $this->setConfig($config); } + public static function create(?array $config = []): self + { + return new self($config); + } + + public static function go($url): self + { + return (new self())->go($url); + } + /** * Sets the config, generates the required Clients and updates the core with the new clients. * diff --git a/tests/CoreTest.php b/tests/CoreTest.php index 1ce528b..9f5bee4 100644 --- a/tests/CoreTest.php +++ b/tests/CoreTest.php @@ -62,25 +62,54 @@ public function testChangeOfCurrentPage() } /** - * Calls should be chainable. + * Calls should be chainable and easy to access. * * @test */ - public function testBasicChainability() + public function testChainability() { + // Testing env: First h1: "We are testing here & elsewhere!" + $url = 'https://test-pages.phpscraper.de/meta/html-entities.html'; + + + // Test 1: Create, navigate to the test page. $web = new \spekulatius\phpscraper; + $web->go($url); + + // Check the h1 + $this->assertSame( + 'We are testing here & elsewhere!', + $web->h1[0] + ); - // Navigate to test page - $web->go('https://phpscraper.de'); + // Test 2: Chained $this->assertSame( - // Unchained - $web->title, + 'We are testing here & elsewhere!', // Chained (new \spekulatius\phpscraper) - ->go('https://phpscraper.de') - ->title + ->go($url) + ->h1[0] + ); + + // Test 3: Static with `create` + $this->assertSame( + 'We are testing here & elsewhere!', + + // Chained + \spekulatius\phpscraper::create() + ->go($url) + ->h1[0] + ); + + // Test 4: Static with `go` + $this->assertSame( + 'We are testing here & elsewhere!', + + // Chained + \spekulatius\phpscraper::go($url) + ->h1[0] ); } } diff --git a/tests/ParserTest.php b/tests/ParserTest.php index f33edf7..d6b645f 100644 --- a/tests/ParserTest.php +++ b/tests/ParserTest.php @@ -187,4 +187,56 @@ public function testDifferentXmlCalls() ->parseXml() ); } + + + + /** + * @test + */ + public function testCsvDecodeRaw() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + $web->csvDecode("date,value\n1945-02-06,420\n1952-03-11,42"), + [ + ['date', 'value'], + ['1945-02-06', 420], + ['1952-03-11', 42], + ] + ); + } + + /** + * @test + */ + public function testCsvDecodeWithHeaderRaw() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), + [ + ['date' => '1945-02-06', 'value' => 420], + ['date' => '1952-03-11', 'value' => 42], + ] + ); + } + + /** + * @test + */ + public function testCsvParserWithConfig() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), + [ + ['date' => '1945-02-06', 'value' => 420], + ['date' => '1952-03-11', 'value' => 42], + ] + ); + } + } From 08569ad699a8f0ad4edbca94f0984726aca291b0 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sun, 13 Nov 2022 14:29:18 +0100 Subject: [PATCH 08/24] rm original test --- tests/ParserTest.php | 242 ------------------------------------------- 1 file changed, 242 deletions(-) delete mode 100644 tests/ParserTest.php diff --git a/tests/ParserTest.php b/tests/ParserTest.php deleted file mode 100644 index d6b645f..0000000 --- a/tests/ParserTest.php +++ /dev/null @@ -1,242 +0,0 @@ -expectException(\Symfony\Component\HttpClient\Exception\ClientException::class); - $this->expectExceptionMessage('HTTP/2 404 returned for "https://phpscraper.de/broken-url"'); - - $web->fetchAsset('https://phpscraper.de/broken-url'); - } - - /** - * Test the various ways to call `parseJson()`. - * - * @test - */ - public function testDifferentJsonCalls() - { - // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). - $web = new \spekulatius\phpscraper; - - // For the reference we are using a simple JSON and parse it. - $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json'); - $jsonData = json_decode($jsonString, true); - - - // Case 1: Passing in an JSON string in. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Parse the $jsonString directly. - (new \spekulatius\phpscraper) - ->parseJson($jsonString) - ); - - - // Case 2: `go` + `parseJson()` - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Chained call using a JSON file as URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/index.json') - ->parseJson() - ); - - - // Case 3: `parseJson()` with absolute URL. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // Pass the absolutely URL to `parseJson()` - (new \spekulatius\phpscraper) - ->parseJson('https://test-pages.phpscraper.de/index.json') - ); - - - // Case 4: `go` + `parseJson()` with relative URL. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // The 'go' sets the base URL for the following relative path. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseJson('/index.json') - ); - - - // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`. - // 5.1. Ensure the final URL is correct. - $this->assertSame( - 'https://test-pages.phpscraper.de/index.json', - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/index.json') - ->currentUrl() - ); - - // 5.2. Ensure the parsed JSON is correct. - $this->assertSame( - // Pass the JSON Data as reference in. - $jsonData, - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/index.json') - ->parseJson() - ); - } - - /** - * @test - */ - public function testDifferentXmlCalls() - { - // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). - $web = new \spekulatius\phpscraper; - - // For the reference we are using a simple XML and parse it. - $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml'); - $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA); - $xmlData = json_decode(json_encode($xml), true); - - - // Case 1: Passing in an XML string in. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Parse the XML string directly. - (new \spekulatius\phpscraper) - ->parseXml($xmlString) - ); - - - // Case 2: `go` + `parseXml()` - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Chained call with XML as URL - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/sitemap.xml') - ->parseXml() - ); - - - // Case 3: `parseXml()` with absolute URL. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // Pass the absolutely URL to `parseXml()` - (new \spekulatius\phpscraper) - ->parseXml('https://test-pages.phpscraper.de/sitemap.xml') - ); - - - // Case 4: `go` + `parseXml()` with relative URL. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // The 'go' sets the base URL for the following relative path. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseXml('/sitemap.xml') - ); - - - // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`. - // 5.1. Ensure the final URL is correct. - $this->assertSame( - 'https://test-pages.phpscraper.de/sitemap.xml', - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/sitemap.xml') - ->currentUrl() - ); - - // 5.2. Ensure the parsed JSON is correct. - $this->assertSame( - // Pass the XML Data as reference in. - $xmlData, - - // The first 'go' sets the base URL for the following `go` with relative URL. - (new \spekulatius\phpscraper) - ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/sitemap.xml') - ->parseXml() - ); - } - - - - /** - * @test - */ - public function testCsvDecodeRaw() - { - $web = new \spekulatius\phpscraper; - - $this->assertSame( - $web->csvDecode("date,value\n1945-02-06,420\n1952-03-11,42"), - [ - ['date', 'value'], - ['1945-02-06', 420], - ['1952-03-11', 42], - ] - ); - } - - /** - * @test - */ - public function testCsvDecodeWithHeaderRaw() - { - $web = new \spekulatius\phpscraper; - - $this->assertSame( - $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), - [ - ['date' => '1945-02-06', 'value' => 420], - ['date' => '1952-03-11', 'value' => 42], - ] - ); - } - - /** - * @test - */ - public function testCsvParserWithConfig() - { - $web = new \spekulatius\phpscraper; - - $this->assertSame( - $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), - [ - ['date' => '1945-02-06', 'value' => 420], - ['date' => '1952-03-11', 'value' => 42], - ] - ); - } - -} From 6474ae2547d49975e502cc8521561c73cd9ed652 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 07:15:36 +0100 Subject: [PATCH 09/24] rm static callers --- src/phpscraper.php | 10 ---------- tests/CoreTest.php | 19 ------------------- 2 files changed, 29 deletions(-) diff --git a/src/phpscraper.php b/src/phpscraper.php index 3a419f8..2ff81be 100644 --- a/src/phpscraper.php +++ b/src/phpscraper.php @@ -36,16 +36,6 @@ public function __construct(?array $config = []) $this->setConfig($config); } - public static function create(?array $config = []): self - { - return new self($config); - } - - public static function go($url): self - { - return (new self())->go($url); - } - /** * Sets the config, generates the required Clients and updates the core with the new clients. * diff --git a/tests/CoreTest.php b/tests/CoreTest.php index 9f5bee4..8446eec 100644 --- a/tests/CoreTest.php +++ b/tests/CoreTest.php @@ -92,24 +92,5 @@ public function testChainability() ->go($url) ->h1[0] ); - - // Test 3: Static with `create` - $this->assertSame( - 'We are testing here & elsewhere!', - - // Chained - \spekulatius\phpscraper::create() - ->go($url) - ->h1[0] - ); - - // Test 4: Static with `go` - $this->assertSame( - 'We are testing here & elsewhere!', - - // Chained - \spekulatius\phpscraper::go($url) - ->h1[0] - ); } } From 95baed7bcb6866fc5dd61e0115054a8dbca92d1b Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 07:21:54 +0100 Subject: [PATCH 10/24] adding ct --- composer.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer.json b/composer.json index 93f8cd1..400ab3b 100644 --- a/composer.json +++ b/composer.json @@ -43,7 +43,8 @@ } }, "scripts": { - "test": "./vendor/phpunit/phpunit/phpunit --cache-result --cache-result-file=/tmp --order-by=defects --colors=always --stop-on-failure" + "test": "./vendor/phpunit/phpunit/phpunit --cache-result --cache-result-file=/tmp --order-by=defects --colors=always --stop-on-failure", + "ct": "while true; do composer run test; sleep 30; done" }, "funding": [ { From 0daf69536371ea50aec4568143eec01dab2e677d Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 13:28:25 +0100 Subject: [PATCH 11/24] Keep original name --- tests/CoreTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CoreTest.php b/tests/CoreTest.php index 8446eec..3a7b018 100644 --- a/tests/CoreTest.php +++ b/tests/CoreTest.php @@ -66,7 +66,7 @@ public function testChangeOfCurrentPage() * * @test */ - public function testChainability() + public function testBasicChainability() { // Testing env: First h1: "We are testing here & elsewhere!" $url = 'https://test-pages.phpscraper.de/meta/html-entities.html'; From 23d4b495586f76c3b6fbf53faa945525df23f30b Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 14:29:24 +0100 Subject: [PATCH 12/24] Implement some CSV base methods --- src/UsesParsers.php | 206 +++++++++++++++++++++++++++++++--------- tests/ParserCsvTest.php | 19 ++-- 2 files changed, 169 insertions(+), 56 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index fddacb1..4cd4e8b 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -5,86 +5,109 @@ trait UsesParsers { /** - * Parse a CSV. + * Base Util to decode CSVs. * + * @param string $csvString + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape * @return array $data */ - public function parseCsv(): array - { + public function csvDecodeRaw( + string $csvString, + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null, + ): array { + try { + $array = array_map( + fn($line) => str_getcsv($line, $separator, $enclosure, $escape), + explode("\n", $csvString) + ); + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); + } + return $array; } /** - * Parse a CSV. + * Decode CSV and cast types. * + * @param string $csvString + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape * @return array $data */ - public function parseCsv1(): array - { + public function csvDecodeWithCasting( + string $csvString, + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null, + ): array { + try { + $array = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); + + // Cast native and custom types + $array = array_map( + fn ($line) => array_map( + fn ($cell) => $this->castType($cell), + $line + ), + $array + ); + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); + } + return $array; } - /** - * Parse a CSV. - * - * @return array $data - */ - public function parseCsv2(): array + // Helper method to cast types + protected function castType(string $entry) { + if ($entry == (int) $entry) { + return (int) $entry; + } + if ($entry == (float) $entry) { + return (float) $entry; + } + + return $entry; } - /** - * Parse a CSV. - * - * @return array $data - */ - public function parseCsv3(): array - { - } - public function csvDecodeRaw() - { - try { - // @todo implement - // string $separator = ",", - // string $enclosure = "\"", - // string $escape = "\\" - $array = array_map('str_getcsv', explode("\n", $csvString)); - } catch (\Exception $e) { - throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); - } - return $array; - } + + + + + + + public function csvDecodeWithHeaderRaw( string $csvString, ?array $options = [] - ): array - { + ): array { // merge options with the configuration and the global defaults - $config = array_merge( - $this->config + // $config = array_merge( + // $this->config - ); + // ); $array = []; try { - - // @todo implement - // string $separator = ",", - // string $enclosure = "\"", - // string $escape = "\\" - - $array = array_map('str_getcsv', explode("\n", $csvString)); + $array = $this->csvDecodeRaw($csvString); $header = array_shift($array); @@ -105,8 +128,7 @@ function(&$row, $key, $header) { public function csvDecodeWithHeader( string $csvString, ?array $options = [] - ): array - { + ): array { $csv = $this->csvDecodeWithHeaderRaw($csvString, $options); // Cast some common types? @@ -126,6 +148,96 @@ public function csvDecodeWithHeader( + /** + * A boilerplate function to process the various calling options for `parseX` functions. + */ + public function parseResource( + ?string $stringOrUrl, + callable $parserFunction + ): ?array { + try { + // If we have a string, let's try to parse the resource from this. + if ($stringOrUrl !== null) { + // Simple: Try to parse what we have been given + try { + $result = $parserFunction($stringOrUrl, true); + } catch (\Exception $e) { + // We don't do anything if it fails - likely we have an URL. Let's continue below. + } + } + + /** + * We fetch the content and process it, if we haven't got a resource as a string. + * + * This is a work-around to allow for: + * + * - `$web->parseJson('https://...')`. + * - `$web->go('...')->parseJson()`. + */ + $result = $result ?? $parserFunction( + // Fetch the resource either using $stringOrUrl + $this->fetchAsset( + // Fallback on the current URL, if needed and possible (`go` was used before). + $stringOrUrl || !$this->currentPage ? $stringOrUrl : $this->currentUrl() + ), + true + ); + } catch (\Exception $e) { + throw new \Exception('Failed to parse resource: ' . $e->getMessage()); + } + + return $result; + } + + + + + + + + /** + * Parses a given CSV string or fetches the URL and parses it. + * + * @param ?string $csvStringOrUrl + * @return array $data + */ + public function parseCsv(?string $csvStringOrUrl = null): array + { + try { + // If we have a string, let's try to parse the CSV from this. + if ($csvStringOrUrl !== null) { + // Simple: Try to parse what we have been given + try { + $result = json_decode($csvStringOrUrl, true); + } catch (\Exception $e) { + // We don't do anything if it fails - likely we have an URL. Let's continue below. + } + } + + /** + * We fetch the content and process it, if we haven't got a CSV as a string. + * + * This is a work-around to allow for: + * + * - `$web->parseJson('https://...')`. + * - `$web->go('...')->parseJson()`. + */ + $result = $result ?? json_decode( + // Fetch the resource either using $csvStringOrUrl + $this->fetchAsset( + // Fallback on the current URL, if needed and possible (`go` was used before). + $csvStringOrUrl || !$this->currentPage ? $csvStringOrUrl : $this->currentUrl() + ), + true + ); + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); + } + + return $result; + } + + // // Associate // $csv = array_map('str_getcsv', file($file)); // array_walk($csv, function(&$a) use ($csv) { diff --git a/tests/ParserCsvTest.php b/tests/ParserCsvTest.php index 7e07e55..8d81494 100644 --- a/tests/ParserCsvTest.php +++ b/tests/ParserCsvTest.php @@ -12,28 +12,29 @@ public function testCsvDecodeRaw() $web = new \spekulatius\phpscraper; $this->assertSame( - $web->csvDecode("date,value\n1945-02-06,420\n1952-03-11,42"), [ ['date', 'value'], - ['1945-02-06', 420], - ['1952-03-11', 42], - ] + ['1945-02-06', '4.20'], + ['1952-03-11', '42'], + ], + $web->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), ); } /** * @test */ - public function testCsvDecodeWithHeaderRaw() + public function testCsvDecodeWithCasting() { $web = new \spekulatius\phpscraper; $this->assertSame( - $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), [ - ['date' => '1945-02-06', 'value' => 420], - ['date' => '1952-03-11', 'value' => 42], - ] + ['date', 'value'], + ['1945-02-06', 4.20], + ['1952-03-11', 42], + ], + $web->csvDecodeWithCasting("date,value\n1945-02-06,4.20\n1952-03-11,42"), ); } From 163efcf2e82a85317d03cf1651ba4ad6cfd2b2bd Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 14:31:10 +0100 Subject: [PATCH 13/24] rm trailing slash --- src/UsesParsers.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 4cd4e8b..80fd1f2 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -17,7 +17,7 @@ public function csvDecodeRaw( string $csvString, ?string $separator = null, ?string $enclosure = null, - ?string $escape = null, + ?string $escape = null ): array { try { $array = array_map( @@ -44,7 +44,7 @@ public function csvDecodeWithCasting( string $csvString, ?string $separator = null, ?string $enclosure = null, - ?string $escape = null, + ?string $escape = null ): array { try { $array = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); From b79130f8e29b9502729eaa3776b52eb49fa62525 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 15:17:08 +0100 Subject: [PATCH 14/24] Progressing basic CSV parsers --- src/UsesParsers.php | 125 ++++++++++++++++++++-------------------- tests/ParserCsvTest.php | 50 ++++++++++++++-- 2 files changed, 106 insertions(+), 69 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 80fd1f2..e93e783 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -5,7 +5,7 @@ trait UsesParsers { /** - * Base Util to decode CSVs. + * Base Util to decode a CSV string. * * @param string $csvString * @param ?string $separator @@ -20,15 +20,15 @@ public function csvDecodeRaw( ?string $escape = null ): array { try { - $array = array_map( - fn($line) => str_getcsv($line, $separator, $enclosure, $escape), + $csv = array_map( + fn ($line) => str_getcsv($line, $separator, $enclosure, $escape), explode("\n", $csvString) ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } - return $array; + return $csv; } /** @@ -47,105 +47,102 @@ public function csvDecodeWithCasting( ?string $escape = null ): array { try { - $array = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); + $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); // Cast native and custom types - $array = array_map( + $csv = array_map( fn ($line) => array_map( fn ($cell) => $this->castType($cell), $line ), - $array + $csv ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } - return $array; - } - - // Helper method to cast types - protected function castType(string $entry) - { - if ($entry == (int) $entry) { - return (int) $entry; - } - - if ($entry == (float) $entry) { - return (float) $entry; - } - - return $entry; + return $csv; } - - - - - - - - - - - - - - - - + /** + * Util to decode a CSV string to asso. array. + * + * @param string $csvString + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape + * @return array $data + */ public function csvDecodeWithHeaderRaw( string $csvString, - ?array $options = [] + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null ): array { - // merge options with the configuration and the global defaults - // $config = array_merge( - // $this->config - - // ); - - $array = []; - try { - $array = $this->csvDecodeRaw($csvString); + $csv = $this->csvDecodeRaw($csvString, $separator, $enclosure, $escape); - $header = array_shift($array); + $header = array_shift($csv); + // Combine the rows with the header entry. array_walk( - $array, - function(&$row, $key, $header) { - $row = array_combine($header, $row); - }, + $csv, + function(&$row, $key, $header) { $row = array_combine($header, $row); }, $header ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } - return $array; + return $csv; } + /** + * Decode a CSV string to asso. array and cast types. + * + * @param string $csvString + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape + * @return array $data + */ public function csvDecodeWithHeader( string $csvString, - ?array $options = [] + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null ): array { - $csv = $this->csvDecodeWithHeaderRaw($csvString, $options); - - // Cast some common types? - if ($config['enableCastTyping']) { - // + try { + $csv = $this->csvDecodeWithHeaderRaw($csvString, $separator, $enclosure, $escape); - // Custom types? Callbacks anyone? - foreach ($config['customTypes'] as $field => $callback) { - // check if the field matches. - $entry = $callback($entry); + // Cast native and custom types + foreach ($csv as $idx => $row) { + foreach ($row as $key => $value) { + $csv[$idx][$key] = $this->castType($value); + } } + } catch (\Exception $e) { + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } return $csv; } + // Helper method to cast types + protected function castType(string $entry) + { + // Looks like an int? + if ($entry == (int) $entry) { + return (int) $entry; + } + // Looks like a float? + if ($entry == (float) $entry) { + return (float) $entry; + } + + return $entry; + } /** diff --git a/tests/ParserCsvTest.php b/tests/ParserCsvTest.php index 8d81494..bc12bf2 100644 --- a/tests/ParserCsvTest.php +++ b/tests/ParserCsvTest.php @@ -39,19 +39,59 @@ public function testCsvDecodeWithCasting() } /** + * Test with pipe as separator, enclosure and escape. + * * @test */ - public function testCsvParserWithConfig() + public function testCsvDecodeWithCastingAndCustomEncoding() { $web = new \spekulatius\phpscraper; $this->assertSame( - $web->csvDecodeWithHeaders("date,value\n1945-02-06,420\n1952-03-11,42"), [ - ['date' => '1945-02-06', 'value' => 420], - ['date' => '1952-03-11', 'value' => 42], - ] + ['date', 'value'], + ['1945-02-06', 4.20], + ['1952-03-11', 42], + ['\\'], + ], + $web->csvDecodeWithCasting( + "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\", + '|', + '"', + '\\' + ) ); } + /** + * @test + */ + public function testCsvDecodeWithHeaderRaw() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + [ + ['date' => '1945-02-06', 'value' => '4.20'], + ['date' => '1952-03-11', 'value' => '42'], + ], + $web->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), + ); + } + + /** + * @test + */ + public function testCsvDecodeWithHeaderAndCasting() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + [ + ['date' => '1945-02-06', 'value' => 4.20], + ['date' => '1952-03-11', 'value' => 42], + ], + $web->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42"), + ); + } } From 7081e80ee461643d60dc588a98853a082fa5c97a Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 14 Nov 2022 15:23:46 +0100 Subject: [PATCH 15/24] Better type casting --- src/UsesParsers.php | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index e93e783..c3b261e 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -129,15 +129,15 @@ public function csvDecodeWithHeader( } // Helper method to cast types - protected function castType(string $entry) + public function castType(string $entry) { // Looks like an int? - if ($entry == (int) $entry) { + if ($entry == (string) (int) $entry) { return (int) $entry; } // Looks like a float? - if ($entry == (float) $entry) { + if ($entry == (string) (float) $entry) { return (float) $entry; } @@ -145,6 +145,9 @@ protected function castType(string $entry) } + + + /** * A boilerplate function to process the various calling options for `parseX` functions. */ @@ -235,12 +238,27 @@ public function parseCsv(?string $csvStringOrUrl = null): array } - // // Associate - // $csv = array_map('str_getcsv', file($file)); - // array_walk($csv, function(&$a) use ($csv) { - // $a = array_combine($csv[0], $a); - // }); - // array_shift($csv); + + + + + + + + + + + + + + + + + + + + + From 719044f2989d1798fd6b09ce2e6ffb85d64b8441 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Tue, 15 Nov 2022 23:06:13 +0100 Subject: [PATCH 16/24] Progressing CSV parsers --- src/UsesParsers.php | 83 +++++---------- tests/ParserCsvTest.php | 225 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 247 insertions(+), 61 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index c3b261e..1558316 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -24,6 +24,11 @@ public function csvDecodeRaw( fn ($line) => str_getcsv($line, $separator, $enclosure, $escape), explode("\n", $csvString) ); + + // While technically 'valid', a single string isn't overly useful and likely not actually a CSV but an URL. + if (count($csv) === 1 && count($csv[0]) === 1) { + throw new \Exception('Does not look CSV-like'); + } } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } @@ -40,7 +45,7 @@ public function csvDecodeRaw( * @param ?string $escape * @return array $data */ - public function csvDecodeWithCasting( + public function csvDecode( string $csvString, ?string $separator = null, ?string $enclosure = null, @@ -144,71 +149,61 @@ public function castType(string $entry) return $entry; } - - - - /** - * A boilerplate function to process the various calling options for `parseX` functions. + * Parses a given CSV string or fetches the URL and parses it. + * + * @param ?string $csvStringOrUrl + * @return array $data */ - public function parseResource( - ?string $stringOrUrl, - callable $parserFunction - ): ?array { + public function parseCsv(?string $csvStringOrUrl = null): array + { try { - // If we have a string, let's try to parse the resource from this. - if ($stringOrUrl !== null) { + // If we have a string, let's try to parse the CSV from this. + if ($csvStringOrUrl !== null) { // Simple: Try to parse what we have been given try { - $result = $parserFunction($stringOrUrl, true); + $result = $this->csvDecode($csvStringOrUrl); } catch (\Exception $e) { // We don't do anything if it fails - likely we have an URL. Let's continue below. } } /** - * We fetch the content and process it, if we haven't got a resource as a string. + * We fetch the content and process it, if we haven't got a CSV as a string. * * This is a work-around to allow for: * * - `$web->parseJson('https://...')`. * - `$web->go('...')->parseJson()`. */ - $result = $result ?? $parserFunction( - // Fetch the resource either using $stringOrUrl + $result = $result ?? $this->csvDecode( + // Fetch the resource either using $csvStringOrUrl $this->fetchAsset( // Fallback on the current URL, if needed and possible (`go` was used before). - $stringOrUrl || !$this->currentPage ? $stringOrUrl : $this->currentUrl() - ), - true + $csvStringOrUrl || !$this->currentPage ? $csvStringOrUrl : $this->currentUrl() + ) ); } catch (\Exception $e) { - throw new \Exception('Failed to parse resource: ' . $e->getMessage()); + throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); } return $result; } - - - - - - /** - * Parses a given CSV string or fetches the URL and parses it. + * Parses a given CSV string into an asso. with headers or fetches the URL and parses it. * * @param ?string $csvStringOrUrl * @return array $data */ - public function parseCsv(?string $csvStringOrUrl = null): array + public function parseCsvWithHeader(?string $csvStringOrUrl = null): array { try { // If we have a string, let's try to parse the CSV from this. if ($csvStringOrUrl !== null) { // Simple: Try to parse what we have been given try { - $result = json_decode($csvStringOrUrl, true); + $result = $this->csvDecodeWithHeader($csvStringOrUrl); } catch (\Exception $e) { // We don't do anything if it fails - likely we have an URL. Let's continue below. } @@ -222,13 +217,12 @@ public function parseCsv(?string $csvStringOrUrl = null): array * - `$web->parseJson('https://...')`. * - `$web->go('...')->parseJson()`. */ - $result = $result ?? json_decode( + $result = $result ?? $this->csvDecodeWithHeader( // Fetch the resource either using $csvStringOrUrl $this->fetchAsset( // Fallback on the current URL, if needed and possible (`go` was used before). $csvStringOrUrl || !$this->currentPage ? $csvStringOrUrl : $this->currentUrl() - ), - true + ) ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); @@ -237,31 +231,6 @@ public function parseCsv(?string $csvStringOrUrl = null): array return $result; } - - - - - - - - - - - - - - - - - - - - - - - - - /** * Parses a given JSON string or fetches the URL and parses it. * diff --git a/tests/ParserCsvTest.php b/tests/ParserCsvTest.php index bc12bf2..f706731 100644 --- a/tests/ParserCsvTest.php +++ b/tests/ParserCsvTest.php @@ -11,6 +11,7 @@ public function testCsvDecodeRaw() { $web = new \spekulatius\phpscraper; + // Only decoding $this->assertSame( [ ['date', 'value'], @@ -19,22 +20,43 @@ public function testCsvDecodeRaw() ], $web->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), ); + + // Fetching and decoding + $this->assertSame( + [ + ['date', 'value'], + ['1945-02-06', '4.20'], + ['1952-03-11', '42'], + ], + $web->csvDecodeRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), + ); } /** * @test */ - public function testCsvDecodeWithCasting() + public function testCsvDecode() { $web = new \spekulatius\phpscraper; + // Only decoding + $this->assertSame( + [ + ['date', 'value'], + ['1945-02-06', 4.20], + ['1952-03-11', 42], + ], + $web->csvDecode("date,value\n1945-02-06,4.20\n1952-03-11,42"), + ); + + // Fetching and decoding $this->assertSame( [ ['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42], ], - $web->csvDecodeWithCasting("date,value\n1945-02-06,4.20\n1952-03-11,42"), + $web->csvDecode($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), ); } @@ -43,7 +65,7 @@ public function testCsvDecodeWithCasting() * * @test */ - public function testCsvDecodeWithCastingAndCustomEncoding() + public function testCsvDecodeAndCustomEncoding() { $web = new \spekulatius\phpscraper; @@ -54,7 +76,7 @@ public function testCsvDecodeWithCastingAndCustomEncoding() ['1952-03-11', 42], ['\\'], ], - $web->csvDecodeWithCasting( + $web->csvDecode( "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\", '|', '"', @@ -70,6 +92,7 @@ public function testCsvDecodeWithHeaderRaw() { $web = new \spekulatius\phpscraper; + // Only decoding $this->assertSame( [ ['date' => '1945-02-06', 'value' => '4.20'], @@ -77,6 +100,15 @@ public function testCsvDecodeWithHeaderRaw() ], $web->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42"), ); + + // Fetching and decoding + $this->assertSame( + [ + ['date' => '1945-02-06', 'value' => '4.20'], + ['date' => '1952-03-11', 'value' => '42'], + ], + $web->csvDecodeWithHeaderRaw($web->fetchAsset('https://test-pages.phpscraper.de/test.csv')), + ); } /** @@ -94,4 +126,189 @@ public function testCsvDecodeWithHeaderAndCasting() $web->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42"), ); } + + /** + * Check the pluming: Test the various ways to call `parseCsv()`. + * + * @test + */ + public function testDifferentCsvCalls() + { + // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). + $web = new \spekulatius\phpscraper; + + // For the reference we are using a simple CSV and parse it. This matches the hosted CSV. + $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42"; + $csvData = [['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42]]; + + + // Case 1: Passing in an CSV string in. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Parse the $csvString directly. + (new \spekulatius\phpscraper) + ->parseCsv($csvString) + ); + + + // Case 2: `go` + `parseCsv()` + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Chained call using a CSV file as URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/test.csv') + ->parseCsv() + ); + + + // Case 3: `parseCsv()` with absolute URL. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Pass the absolutely URL to `parseCsv()` + (new \spekulatius\phpscraper) + ->parseCsv('https://test-pages.phpscraper.de/test.csv') + ); + + + // Case 4: `go` + `parseCsv()` with relative URL. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseCsv('/test.csv') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseCsv()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/test.csv', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test.csv') + ->currentUrl() + ); + + // 5.2. Ensure the parsed CSV is correct. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test.csv') + ->parseCsv() + ); + } + + /** + * Check the pluming: Test the various ways to call `parseCsvWithHeader()`. + * + * @test + */ + public function testDifferentCsvWithHeaderCalls() + { + // Downloads the PHPScraper sitemap and ensures the homepage is included (valid download and output). + $web = new \spekulatius\phpscraper; + + // For the reference we are using a simple CSV and parse it. This matches the hosted CSV. + $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42"; + $csvData = [ + ['date' => '1945-02-06', 'value' => 4.20], + ['date' => '1952-03-11', 'value' => 42], + ]; + + // Case 1: Passing in an CSV string in. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Parse the $csvString directly. + (new \spekulatius\phpscraper) + ->parseCsvWithHeader($csvString) + ); + + + // Case 2: `parseCsvWithHeader()` + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Chained call using a CSV file as URL. + (new \spekulatius\phpscraper) + ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv') + ); + + + // Case 2: `go` + `parseCsvWithHeader()` + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Chained call using a CSV file as URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/test.csv') + ->parseCsvWithHeader() + ); + + + // Case 3: `parseCsvWithHeader()` with absolute URL. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // Pass the absolutely URL to `parseCsvWithHeader()` + (new \spekulatius\phpscraper) + ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv') + ); + + + // Case 4: `go` + `parseCsvWithHeader()` with relative URL. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The 'go' sets the base URL for the following relative path. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseCsvWithHeader('/test.csv') + ); + + + // Case 5: `go` with base URL + `go` with relative URL + `parseCsvWithHeader()`. + // 5.1. Ensure the final URL is correct. + $this->assertSame( + 'https://test-pages.phpscraper.de/test.csv', + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test.csv') + ->currentUrl() + ); + + // 5.2. Ensure the parsed CSV is correct. + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test.csv') + ->parseCsvWithHeader() + ); + } } From 26b18edeb1568f6868824af66390b7fa70f14408 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sun, 20 Nov 2022 11:46:09 +0100 Subject: [PATCH 17/24] Adding documentation page for file parsing --- websites/.vuepress/config.theme.en.js | 1 + .../examples/parse-csv-json-and-xml-files.md | 145 ++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 websites/examples/parse-csv-json-and-xml-files.md diff --git a/websites/.vuepress/config.theme.en.js b/websites/.vuepress/config.theme.en.js index 46ce525..3d6fe25 100644 --- a/websites/.vuepress/config.theme.en.js +++ b/websites/.vuepress/config.theme.en.js @@ -20,6 +20,7 @@ module.exports = { 'examples/scrape-meta-tags', 'examples/scrape-social-media-meta-tags', 'examples/scrape-feeds', + 'examples/parse-csv-json-and-xml-files', ], }, { diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md new file mode 100644 index 0000000..461d70d --- /dev/null +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -0,0 +1,145 @@ +--- +image: https://api.imageee.com/bold?text=PHP:%20Scrape%20Common%20File%20Types&bg_image=https://images.unsplash.com/photo-1542762933-ab3502717ce7 +--- + +# Scrape CSV-, XML-, and JSON-URLs/Files + +PHPScraper can process common plain file types such as `csv`, `json`, `xml` from strings or URLs for you. Most functionality described below works for all types. Special cases are noted. The following topics are covered: + +[[toc]] + + +## Parsing of CSV/XML/JSON strings + +If you have a string that represents a CSV, XML or JSON, PHPScraper can assist in validating and parsing it: + +```php +$web = new \spekulatius\phpscraper; + +// Parse a JSON string +$json = $web->parseJson($jsonString); + +// Parse an XML string +$xml = $web->parseXml($xmlString); + +// Parse a CSV string +$csv = $web->parseCsv($csvString); +``` + +This can be useful when chaining steps or accessing embedded elements such as schema-data. + + +## Fetching and Parsing of CSV/XML/JSON URLs + +PHPScraper can assist with fetching and parsing the contents of an remote resources (URLs) containing JSON-, CSV- or XML-data: + +```php +$web = new \spekulatius\phpscraper; + +// Fetches URL and parses contents to JSON. +$json = $web + ->parseJson('https://test-pages.phpscraper.de/index.json'); + +// Fetches URL and parses contents to XML. +$xml = $web + ->parseXml('https://test-pages.phpscraper.de/sitemap.xml'); + +// Fetches URL and parses contents into simple array. +$csv = $web + ->parseCsv('https://test-pages.phpscraper.de/test.csv'); + +// Fetches URL and generates an asso. array with the first line as keys. +$csv = $web + ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv'); +``` + +Each of the methods above can be accessed in various ways. Using `parseCsv` as an example, you can use any of the methods as following: + +```php +$web = new \spekulatius\phpscraper; + +// Option 1: Pass in the absolute URL +$csv = $web + ->parseCsv('https://test-pages.phpscraper.de/test.csv'); + +// Option 2: Navigate to relative URL for parsing. +$csv = $web + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseCsv('/test.csv'); + +// Option 3: Navigate with `go` or `clickLink` and call parser. +$csv = $web + ->go('https://test-pages.phpscraper.de/test.csv') + ->parseCsv(); +``` + +::: notice Multiple Methods +The examples above apply for the following methods: + +- `parseJson` +- `parseXml` +- `parseCsv` +- `parseCsvWithHeader` (resolves into an asso. array) +::: + +## Parsing a CSV string with Headers + +CSV can be parsed into various data structures. PHPScraper comes with two options built-in to parse CSV. Given the following example file: + +```bash +$ curl https://test-pages.phpscraper.de/test.csv + +date,value +1945-02-06,4.20 +1952-03-11,42 +``` + +The standard parser `parseCsv` returns a simple array with casted values: + +```php +$web = new \spekulatius\phpscraper; + +print_r( + $web->parseCsv('https://test-pages.phpscraper.de/test.csv') +); +/** + * [ + * ['date', 'value'], + * ['1945-02-06', 4.20], + * ['1952-03-11', 42], + * ] + */ +``` + +`parseCsvWithHeader` parses the content and uses the first line as headers and returns an associative array: + +```php +$web = new \spekulatius\phpscraper; + +print_r( + $web->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv') +); + +/** + * [ + * ['date' => '1945-02-06', 'value' => 4.20], + * ['date' => '1952-03-11', 'value' => 42], + * ] + */ +``` + +::: tip Type Casting +Native types such as `int` and `float` are automatically casted to PHP-native types. +::: + +## Providing CSV Parsing Parameters + +You might want to define which *separate*, *enclosure* and *escape* to use. You can do so by passing an options array along: + +```php +$web = new \spekulatius\phpscraper; + +$csv = $web + ->go('https://test-pages.phpscraper.de/test.csv') + ->parseCsv(); +``` From 399da3f5dc189d4c8bf4f25b675442d7943b9cc6 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sun, 20 Nov 2022 13:01:36 +0100 Subject: [PATCH 18/24] Undo to keep PRs more specific --- websites/examples/scrape-feeds.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/websites/examples/scrape-feeds.md b/websites/examples/scrape-feeds.md index 9d3305d..a80571f 100644 --- a/websites/examples/scrape-feeds.md +++ b/websites/examples/scrape-feeds.md @@ -11,7 +11,7 @@ PHPScraper can identify and process feeds (RSS feeds, sitemaps, etc.) for you. T ## Identify RSS Feed URLs -Websites can define RSS feeds in the head section of their markup. PHPScraper allows to identify any RSS feeds of the current page using `rssUrls`: +Websites can define RSS feeds in the head section of their markup. PHPScraper allows to identify the RSS feeds of the current page using `rssUrls`: ```php $web = new \spekulatius\phpscraper; From d69dadcf4416c87624880d28da61d632b27669e9 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 17:57:09 +0100 Subject: [PATCH 19/24] Tidy up doc page --- websites/examples/parse-csv-json-and-xml-files.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md index 461d70d..b0bdbbf 100644 --- a/websites/examples/parse-csv-json-and-xml-files.md +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -31,7 +31,7 @@ This can be useful when chaining steps or accessing embedded elements such as sc ## Fetching and Parsing of CSV/XML/JSON URLs -PHPScraper can assist with fetching and parsing the contents of an remote resources (URLs) containing JSON-, CSV- or XML-data: +PHPScraper can assist with fetching and parsing the contents of remote resources (URLs) containing JSON-, CSV- or XML data: ```php $web = new \spekulatius\phpscraper; @@ -44,11 +44,11 @@ $json = $web $xml = $web ->parseXml('https://test-pages.phpscraper.de/sitemap.xml'); -// Fetches URL and parses contents into simple array. +// Fetches URL and parses contents into a simple array. $csv = $web ->parseCsv('https://test-pages.phpscraper.de/test.csv'); -// Fetches URL and generates an asso. array with the first line as keys. +// Fetches URL and generates an asso. array (map) with the first line as keys. $csv = $web ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv'); ``` @@ -62,12 +62,12 @@ $web = new \spekulatius\phpscraper; $csv = $web ->parseCsv('https://test-pages.phpscraper.de/test.csv'); -// Option 2: Navigate to relative URL for parsing. +// Option 2: Navigate to a relative URL for parsing. $csv = $web ->go('https://test-pages.phpscraper.de/meta/feeds.html') ->parseCsv('/test.csv'); -// Option 3: Navigate with `go` or `clickLink` and call parser. +// Option 3: Navigate with `go` or `clickLink` and call the parser. $csv = $web ->go('https://test-pages.phpscraper.de/test.csv') ->parseCsv(); @@ -129,12 +129,12 @@ print_r( ``` ::: tip Type Casting -Native types such as `int` and `float` are automatically casted to PHP-native types. +Native types such as `int` and `float` are automatically cast to PHP-native types. ::: ## Providing CSV Parsing Parameters -You might want to define which *separate*, *enclosure* and *escape* to use. You can do so by passing an options array along: +You might want to define which *separate*, *enclosure*, and *escape* to use. You can do so by passing an options array along: ```php $web = new \spekulatius\phpscraper; From cb600bc6a32d8726f251a4d3bb0d90cfcd317a68 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 21:04:39 +0100 Subject: [PATCH 20/24] Add separator, enclosure & escape support, tests and doc --- src/UsesParsers.php | 40 ++++++++--- tests/ParserCsvTest.php | 70 +++++++++++++++++++ .../examples/parse-csv-json-and-xml-files.md | 7 +- 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/src/UsesParsers.php b/src/UsesParsers.php index 1558316..98b4bc3 100644 --- a/src/UsesParsers.php +++ b/src/UsesParsers.php @@ -153,16 +153,23 @@ public function castType(string $entry) * Parses a given CSV string or fetches the URL and parses it. * * @param ?string $csvStringOrUrl + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape * @return array $data */ - public function parseCsv(?string $csvStringOrUrl = null): array - { + public function parseCsv( + ?string $csvStringOrUrl = null, + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null + ): array { try { // If we have a string, let's try to parse the CSV from this. if ($csvStringOrUrl !== null) { // Simple: Try to parse what we have been given try { - $result = $this->csvDecode($csvStringOrUrl); + $result = $this->csvDecode($csvStringOrUrl, $separator, $enclosure, $escape); } catch (\Exception $e) { // We don't do anything if it fails - likely we have an URL. Let's continue below. } @@ -173,15 +180,18 @@ public function parseCsv(?string $csvStringOrUrl = null): array * * This is a work-around to allow for: * - * - `$web->parseJson('https://...')`. - * - `$web->go('...')->parseJson()`. + * - `$web->parseCsv('https://...')`. + * - `$web->go('...')->parseCsv()`. */ $result = $result ?? $this->csvDecode( // Fetch the resource either using $csvStringOrUrl $this->fetchAsset( // Fallback on the current URL, if needed and possible (`go` was used before). $csvStringOrUrl || !$this->currentPage ? $csvStringOrUrl : $this->currentUrl() - ) + ), + $separator, + $enclosure, + $escape ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); @@ -194,16 +204,23 @@ public function parseCsv(?string $csvStringOrUrl = null): array * Parses a given CSV string into an asso. with headers or fetches the URL and parses it. * * @param ?string $csvStringOrUrl + * @param ?string $separator + * @param ?string $enclosure + * @param ?string $escape * @return array $data */ - public function parseCsvWithHeader(?string $csvStringOrUrl = null): array - { + public function parseCsvWithHeader( + ?string $csvStringOrUrl = null, + ?string $separator = null, + ?string $enclosure = null, + ?string $escape = null + ): array { try { // If we have a string, let's try to parse the CSV from this. if ($csvStringOrUrl !== null) { // Simple: Try to parse what we have been given try { - $result = $this->csvDecodeWithHeader($csvStringOrUrl); + $result = $this->csvDecodeWithHeader($csvStringOrUrl, $separator, $enclosure, $escape); } catch (\Exception $e) { // We don't do anything if it fails - likely we have an URL. Let's continue below. } @@ -222,7 +239,10 @@ public function parseCsvWithHeader(?string $csvStringOrUrl = null): array $this->fetchAsset( // Fallback on the current URL, if needed and possible (`go` was used before). $csvStringOrUrl || !$this->currentPage ? $csvStringOrUrl : $this->currentUrl() - ) + ), + $separator, + $enclosure, + $escape ); } catch (\Exception $e) { throw new \Exception('Failed to parse CSV: ' . $e->getMessage()); diff --git a/tests/ParserCsvTest.php b/tests/ParserCsvTest.php index f706731..7c51d73 100644 --- a/tests/ParserCsvTest.php +++ b/tests/ParserCsvTest.php @@ -127,6 +127,30 @@ public function testCsvDecodeWithHeaderAndCasting() ); } + /** + * Test with header, pipe as separator, and enclosure. + * + * @test + */ + public function testCsvDecodeWithHeaderAndCustomEncoding() + { + $web = new \spekulatius\phpscraper; + + $this->assertSame( + [ + ['date' => '1945-02-06', 'value' => 4.20], + ['date' => '1952-03-11', 'value' => 42], + ], + + $web->csvDecodeWithHeader( + "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"", + '|', + '"', + '\\' + ) + ); + } + /** * Check the pluming: Test the various ways to call `parseCsv()`. * @@ -211,6 +235,29 @@ public function testDifferentCsvCalls() ->go('/test.csv') ->parseCsv() ); + + // Case 6: With encoding params + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test-encoded.csv') + ->parseCsv(null, '|', '"') + ); + + // Case 7: With encoding params and (relative) URL + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseCsv('/test-encoded.csv', '|', '"') + ); } /** @@ -310,5 +357,28 @@ public function testDifferentCsvWithHeaderCalls() ->go('/test.csv') ->parseCsvWithHeader() ); + + // Case 6: With encoding params + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->go('/test-encoded.csv') + ->parseCsvWithHeader(null, '|', '"') + ); + + // Case 7: With encoding params and (relative) URL + $this->assertSame( + // Pass the CSV Data as reference in. + $csvData, + + // The first 'go' sets the base URL for the following `go` with relative URL. + (new \spekulatius\phpscraper) + ->go('https://test-pages.phpscraper.de/meta/feeds.html') + ->parseCsvWithHeader('/test-encoded.csv', '|', '"') + ); } } diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md index b0bdbbf..bb63e55 100644 --- a/websites/examples/parse-csv-json-and-xml-files.md +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -139,7 +139,12 @@ You might want to define which *separate*, *enclosure*, and *escape* to use. You ```php $web = new \spekulatius\phpscraper; +// Direct access: +$csv = $web + ->parseCsv('https://test-pages.phpscraper.de/test-custom.csv', '|', '"'); + +// Alternative syntax using `go` first: $csv = $web ->go('https://test-pages.phpscraper.de/test.csv') - ->parseCsv(); + ->parseCsv(null, '|', '"'); ``` From 3705d1abb6eb3b614a3aefd89cc6be517b156afa Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 21:08:06 +0100 Subject: [PATCH 21/24] Fix filename --- tests/ParserCsvTest.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ParserCsvTest.php b/tests/ParserCsvTest.php index 7c51d73..e262cc3 100644 --- a/tests/ParserCsvTest.php +++ b/tests/ParserCsvTest.php @@ -244,7 +244,7 @@ public function testDifferentCsvCalls() // The first 'go' sets the base URL for the following `go` with relative URL. (new \spekulatius\phpscraper) ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/test-encoded.csv') + ->go('/test-custom.csv') ->parseCsv(null, '|', '"') ); @@ -256,7 +256,7 @@ public function testDifferentCsvCalls() // The first 'go' sets the base URL for the following `go` with relative URL. (new \spekulatius\phpscraper) ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseCsv('/test-encoded.csv', '|', '"') + ->parseCsv('/test-custom.csv', '|', '"') ); } @@ -366,7 +366,7 @@ public function testDifferentCsvWithHeaderCalls() // The first 'go' sets the base URL for the following `go` with relative URL. (new \spekulatius\phpscraper) ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->go('/test-encoded.csv') + ->go('/test-custom.csv') ->parseCsvWithHeader(null, '|', '"') ); @@ -378,7 +378,7 @@ public function testDifferentCsvWithHeaderCalls() // The first 'go' sets the base URL for the following `go` with relative URL. (new \spekulatius\phpscraper) ->go('https://test-pages.phpscraper.de/meta/feeds.html') - ->parseCsvWithHeader('/test-encoded.csv', '|', '"') + ->parseCsvWithHeader('/test-custom.csv', '|', '"') ); } } From 9bcf8fbfb22882cf9e6c2d5cd9912be0a886477a Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 21:18:12 +0100 Subject: [PATCH 22/24] Tweak docu --- websites/examples/parse-csv-json-and-xml-files.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md index bb63e55..a2c4094 100644 --- a/websites/examples/parse-csv-json-and-xml-files.md +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -2,7 +2,7 @@ image: https://api.imageee.com/bold?text=PHP:%20Scrape%20Common%20File%20Types&bg_image=https://images.unsplash.com/photo-1542762933-ab3502717ce7 --- -# Scrape CSV-, XML-, and JSON-URLs/Files +# Scrape CSV-, XML-, and JSON PHPScraper can process common plain file types such as `csv`, `json`, `xml` from strings or URLs for you. Most functionality described below works for all types. Special cases are noted. The following topics are covered: @@ -26,7 +26,7 @@ $xml = $web->parseXml($xmlString); $csv = $web->parseCsv($csvString); ``` -This can be useful when chaining steps or accessing embedded elements such as schema-data. +This can be useful when chaining steps or accessing embedded elements such as schema data. ## Fetching and Parsing of CSV/XML/JSON URLs @@ -74,7 +74,7 @@ $csv = $web ``` ::: notice Multiple Methods -The examples above apply for the following methods: +The examples above apply to the following methods: - `parseJson` - `parseXml` @@ -82,7 +82,7 @@ The examples above apply for the following methods: - `parseCsvWithHeader` (resolves into an asso. array) ::: -## Parsing a CSV string with Headers +## Parsing a CSV String with Headers CSV can be parsed into various data structures. PHPScraper comes with two options built-in to parse CSV. Given the following example file: From 1c4b2b25448d6dfb4b85fa74bc4805b18b78be4a Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 21:21:56 +0100 Subject: [PATCH 23/24] Improve title --- websites/examples/parse-csv-json-and-xml-files.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md index a2c4094..bcf0f8b 100644 --- a/websites/examples/parse-csv-json-and-xml-files.md +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -2,7 +2,7 @@ image: https://api.imageee.com/bold?text=PHP:%20Scrape%20Common%20File%20Types&bg_image=https://images.unsplash.com/photo-1542762933-ab3502717ce7 --- -# Scrape CSV-, XML-, and JSON +# Scrape CSV, XML and JSON PHPScraper can process common plain file types such as `csv`, `json`, `xml` from strings or URLs for you. Most functionality described below works for all types. Special cases are noted. The following topics are covered: From 885ed1469a347203c4dba3c18a145fd2b097b92a Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Mon, 21 Nov 2022 21:24:29 +0100 Subject: [PATCH 24/24] Final tweaks on documentation page --- websites/examples/parse-csv-json-and-xml-files.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/websites/examples/parse-csv-json-and-xml-files.md b/websites/examples/parse-csv-json-and-xml-files.md index bcf0f8b..4cd6175 100644 --- a/websites/examples/parse-csv-json-and-xml-files.md +++ b/websites/examples/parse-csv-json-and-xml-files.md @@ -4,7 +4,7 @@ image: https://api.imageee.com/bold?text=PHP:%20Scrape%20Common%20File%20Types&b # Scrape CSV, XML and JSON -PHPScraper can process common plain file types such as `csv`, `json`, `xml` from strings or URLs for you. Most functionality described below works for all types. Special cases are noted. The following topics are covered: +PHPScraper can process common plain file types such as `csv`, `json`, `xml` from strings or URLs for you. Most functionality described below works for all three types. Special cases are noted. The following topics are covered: [[toc]] @@ -73,7 +73,7 @@ $csv = $web ->parseCsv(); ``` -::: notice Multiple Methods +::: tip Multiple Methods The examples above apply to the following methods: - `parseJson` @@ -111,7 +111,7 @@ print_r( */ ``` -`parseCsvWithHeader` parses the content and uses the first line as headers and returns an associative array: +`parseCsvWithHeader` parses the content and uses the first line as headers and returns an associative array (map): ```php $web = new \spekulatius\phpscraper;