diff --git a/composer.json b/composer.json index e6068e94..6b4585aa 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,8 @@ "simplepie/simplepie": "^1.5", "smalot/pdfparser": "^1.0", "symfony/options-resolver": "^3.4|^4.4|^5.3", - "true/punycode": "^2.1" + "true/punycode": "^2.1", + "guzzlehttp/psr7": "^1.5.0" }, "require-dev": { "friendsofphp/php-cs-fixer": "^3.0", diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index ba8d4db7..e1f774a6 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) } // remove utm parameters & fragment - $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl)); + $uri = new Uri(str_replace('&', '&', $effectiveUrl)); + parse_str($uri->getQuery(), $query); + $queryParameters = array_filter($query, function ($k) { + return !(0 === stripos($k, 'utm_')); + }, \ARRAY_FILTER_USE_KEY); + $effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters); $this->logger->info('Data fetched: {data}', ['data' => [ 'effective_url' => $effectiveUrl, diff --git a/tests/Extractor/HttpClientTest.php b/tests/Extractor/HttpClientTest.php index 3ca97dcd..5b3f7de6 100644 --- a/tests/Extractor/HttpClientTest.php +++ b/tests/Extractor/HttpClientTest.php @@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi $this->assertArrayNotHasKey('accept', $records[3]['context']); } } + + public function dataForWithUrlContainingQueryAndFragment(): array + { + return [ + [ + 'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?utm_content=111315005', + 'expectedUrl' => 'https://example.com/foo', + ], + ]; + } + + /** + * @dataProvider dataForWithUrlContainingQueryAndFragment + */ + public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void + { + $httpMockClient = new HttpMockClient(); + $httpMockClient->addResponse(new Response(200)); + + $http = new HttpClient($httpMockClient); + $res = $http->fetch($url); + + $this->assertSame($expectedUrl, $res['effective_url']); + } } diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php index d7718d95..91a09a19 100644 --- a/tests/GrabyFunctionalTest.php +++ b/tests/GrabyFunctionalTest.php @@ -220,7 +220,7 @@ public function testYoutubeOembed(): void $this->assertSame(200, $res['status']); $this->assertEmpty($res['language']); - $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']); + $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']); $this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']); // $this->assertSame('', $res['html']); $this->assertSame('[embedded content]', $res['summary']); diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php index 3c349b5f..38f0516f 100644 --- a/tests/GrabyTest.php +++ b/tests/GrabyTest.php @@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void public function dataForSinglePage(): array { return [ - 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'], + 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'], 'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'], 'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'], 'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'],