diff --git a/composer.json b/composer.json
index e6068e94..6b4585aa 100644
--- a/composer.json
+++ b/composer.json
@@ -31,7 +31,8 @@
"simplepie/simplepie": "^1.5",
"smalot/pdfparser": "^1.0",
"symfony/options-resolver": "^3.4|^4.4|^5.3",
- "true/punycode": "^2.1"
+ "true/punycode": "^2.1",
+ "guzzlehttp/psr7": "^1.5.0"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.0",
diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php
index ba8d4db7..e1f774a6 100644
--- a/src/Extractor/HttpClient.php
+++ b/src/Extractor/HttpClient.php
@@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
}
// remove utm parameters & fragment
- $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl));
+ $uri = new Uri(str_replace('&amp;', '&', $effectiveUrl)); // undo HTML-escaped ampersands so "&amp;utm_x=1" is split like "&utm_x=1"
+ parse_str($uri->getQuery(), $query); // NOTE(review): bracketed params (a[]=1) become arrays here - confirm withQueryValues copes with them
+ $queryParameters = array_filter($query, static function ($k) { // keep every parameter whose name does not start with "utm_" (case-insensitive)
+ return 0 !== stripos((string) $k, 'utm_'); // cast: parse_str yields int keys for numeric parameter names
+ }, \ARRAY_FILTER_USE_KEY);
+ $effectiveUrl = (string) Uri::withQueryValues($uri->withFragment('')->withQuery(''), $queryParameters); // rebuild from the existing Uri: no fragment, filtered query
$this->logger->info('Data fetched: {data}', ['data' => [
'effective_url' => $effectiveUrl,
diff --git a/tests/Extractor/HttpClientTest.php b/tests/Extractor/HttpClientTest.php
index 3ca97dcd..5b3f7de6 100644
--- a/tests/Extractor/HttpClientTest.php
+++ b/tests/Extractor/HttpClientTest.php
@@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi
$this->assertArrayNotHasKey('accept', $records[3]['context']);
}
}
+
+ public function dataForWithUrlContainingQueryAndFragment(): array
+ {
+ return [ // each data set is keyed by its scenario so a failure names the case instead of "#0", "#1", ...
+ 'utm_* parameters are removed, other parameters are kept' => [
+ 'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello',
+ 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
+ ],
+ 'fragment is removed, query is kept' => [
+ 'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment',
+ 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello',
+ ],
+ 'query holding only utm_* parameters is removed entirely' => [
+ 'url' => 'https://example.com/foo?utm_content=111315005',
+ 'expectedUrl' => 'https://example.com/foo',
+ ],
+ ];
+ }
+
+ /**
+ * The effective URL reported by fetch() must match the expected URL (utm_* parameters and fragment stripped).
+ *
+ * @dataProvider dataForWithUrlContainingQueryAndFragment
+ */
+ public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void
+ {
+ $mockClient = new HttpMockClient();
+ $mockClient->addResponse(new Response(200)); // bare 200 response queued on the mock client
+ $effectiveUrl = (new HttpClient($mockClient))->fetch($url)['effective_url'];
+
+ $this->assertSame($expectedUrl, $effectiveUrl);
+ }
}
diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php
index d7718d95..91a09a19 100644
--- a/tests/GrabyFunctionalTest.php
+++ b/tests/GrabyFunctionalTest.php
@@ -220,7 +220,7 @@ public function testYoutubeOembed(): void
$this->assertSame(200, $res['status']);
$this->assertEmpty($res['language']);
- $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']);
+ $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']);
$this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']);
// $this->assertSame('', $res['html']);
$this->assertSame('[embedded content]', $res['summary']);
diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php
index 3c349b5f..38f0516f 100644
--- a/tests/GrabyTest.php
+++ b/tests/GrabyTest.php
@@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void
public function dataForSinglePage(): array
{
return [
- 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'],
+ 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'],
'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'],
'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'],
'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'],