From f50a83f5201c36499487e214a4293cae1f1f9323 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Nov 2021 22:44:57 +0100 Subject: [PATCH 1/8] Type everything Also: - test PHP 8.1 - update to PHPStan@1 - allow symfony/* 6.0 --- .github/workflows/coding-standards.yml | 2 +- .github/workflows/continuous-integration.yml | 1 + composer.json | 12 +- phpstan.neon | 22 +++- src/Extractor/ContentExtractor.php | 126 +++++++++---------- src/Extractor/HttpClient.php | 38 ++---- src/Graby.php | 94 +++++--------- src/HttpClient/Plugin/CookiePlugin.php | 2 +- src/HttpClient/Plugin/History.php | 14 +-- src/Monolog/Handler/GrabyHandler.php | 13 +- src/SiteConfig/ConfigBuilder.php | 31 ++--- tests/Extractor/ContentExtractorTest.php | 66 +++++----- 12 files changed, 186 insertions(+), 235 deletions(-) diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml index 5d8714c4..6718bb3e 100644 --- a/.github/workflows/coding-standards.yml +++ b/.github/workflows/coding-standards.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: php: - - "7.3" + - "7.4" steps: - name: "Checkout" diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml index a59ecd7c..0310e04e 100644 --- a/.github/workflows/continuous-integration.yml +++ b/.github/workflows/continuous-integration.yml @@ -24,6 +24,7 @@ jobs: - "7.3" - "7.4" - "8.0" + - "8.1" steps: - name: "Checkout" diff --git a/composer.json b/composer.json index e6068e94..d3dcaae8 100644 --- a/composer.json +++ b/composer.json @@ -29,8 +29,8 @@ "php-http/httplug": "^2.2", "php-http/message": "^1.9", "simplepie/simplepie": "^1.5", - "smalot/pdfparser": "^1.0", - "symfony/options-resolver": "^3.4|^4.4|^5.3", + "smalot/pdfparser": "^2.0", + "symfony/options-resolver": "^3.4|^4.4|^5.3|^6.0", "true/punycode": "^2.1" }, "require-dev": { @@ -39,10 +39,10 @@ "php-http/guzzle6-adapter": "^2.0", "php-http/mock-client": "^1.4", "phpstan/extension-installer": "^1.0", - "phpstan/phpstan": "^0.12", - "phpstan/phpstan-deprecation-rules": "^0.12", - "phpstan/phpstan-phpunit": "^0.12", - "symfony/phpunit-bridge": "^5.3" + "phpstan/phpstan": "^1.2", + "phpstan/phpstan-deprecation-rules": "^1.0", + "phpstan/phpstan-phpunit": "^1.0", + "symfony/phpunit-bridge": "^5.4" }, "extra": { "branch-alias": { diff --git a/phpstan.neon b/phpstan.neon index 720bf5ea..6482e32c 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -12,15 +12,27 @@ parameters: - message: '#Http\\Adapter\\Guzzle5\\Client\\|Http\\Adapter\\Guzzle6\\Client\\|Http\\Client\\Curl\\Client given#' path: %currentWorkingDirectory%/tests/Extractor/HttpClientTest.php - # we don't want to BC by defining typehint everywhere - # TODO: remove when jumping to 3.0 - - - message: '#typehint specified.#' - path: %currentWorkingDirectory%/src/ # phpstan does not seem to recognize the class override for JSLikeHTMLElement - message: '#Call to an undefined method DOMElement::setInnerHtml\(\)#' path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php + - + message: '#\$innerHTML#' + path: %currentWorkingDirectory% + # can't find a way to cast or properly defined some return elements + - + message: '#DOMNode, object given#' + path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php + # other stuff I might have fucked up with DOM* classes + - + message: '#DOMNode::\$tagName#' + path: %currentWorkingDirectory%/src/ + - + message: '#DOMNode::getElementsByTagName#' + path: %currentWorkingDirectory%/src/ + - + message: '#expects DOMElement, DOMNode given#' + path: %currentWorkingDirectory%/src/Graby.php inferPrivatePropertyTypeFromConstructor: true checkMissingIterableValueType: false diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 9319b8fa..71d125b3 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -17,20 +17,24 @@ */ class ContentExtractor { + /** @var Readability|null */ public $readability; + /** @var \DOMXPath|null */ private $xpath; - private $html; - private $config; + private ?string $html = null; + private array $config; + /** @var SiteConfig|null */ private $siteConfig = null; - private $title = null; - private $language = null; - private $authors = []; + private ?string $title = null; + private ?string $language = null; + private array $authors = []; + /** @var \DOMElement|\DOMNode|null */ private $body = null; - private $image = null; - private $nativeAd = false; - private $date = null; - private $success = false; - private $nextPageUrl; + private ?string $image = null; + private bool $nativeAd = false; + private ?string $date = null; + private bool $success = false; + private ?string $nextPageUrl = null; /** @var LoggerInterface */ private $logger; /** @var ConfigBuilder */ @@ -74,13 +78,13 @@ public function __construct($config = [], LoggerInterface $logger = null, Config $this->configBuilder = null === $configBuilder ? new ConfigBuilder($this->config['config_builder'], $this->logger) : $configBuilder; } - public function setLogger(LoggerInterface $logger) + public function setLogger(LoggerInterface $logger): void { $this->logger = $logger; $this->configBuilder->setLogger($logger); } - public function reset() + public function reset(): void { $this->xpath = null; $this->html = null; @@ -100,11 +104,9 @@ public function reset() * Try to find a host depending on a meta that can be in the html. * It allow to determine if a website is generated using Wordpress, Blogger, etc .. * - * @param string $html - * * @return string|false */ - public function findHostUsingFingerprints($html) + public function findHostUsingFingerprints(string $html) { foreach ($this->config['fingerprints'] as $metaPattern => $host) { if (1 === preg_match($metaPattern, $html)) { @@ -117,14 +119,8 @@ public function findHostUsingFingerprints($html) /** * Returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default). - * - * @param string $url - * @param string $html - * @param bool $addToCache - * - * @return SiteConfig */ - public function buildSiteConfig($url, $html = '', $addToCache = true) + public function buildSiteConfig(string $url, string $html = '', bool $addToCache = true): SiteConfig { $config = $this->configBuilder->buildFromUrl($url, $addToCache); @@ -158,14 +154,12 @@ public function buildSiteConfig($url, $html = '', $addToCache = true) * Tidy helps us deal with PHP's patchy HTML parsing most of the time * but it has problems of its own which we try to avoid with this option. * - * @param string $html - * @param string $url * @param SiteConfig $siteConfig Will avoid to recalculate the site config * @param bool $smartTidy Do we need to tidy the html ? * * @return bool true on success, false on failure */ - public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy = true) + public function process(string $html, string $url, SiteConfig $siteConfig = null, $smartTidy = true): bool { $this->reset(); @@ -665,47 +659,50 @@ function ($element, $currentEntity) { return $this->success; } + /** + * @return \DOMElement|\DOMNode|null + */ public function getContent() { return $this->body; } - public function isNativeAd() + public function isNativeAd(): bool { return $this->nativeAd; } - public function getTitle() + public function getTitle(): ?string { return $this->title; } - public function getDate() + public function getDate(): ?string { return $this->date; } - public function getAuthors() + public function getAuthors(): ?array { return $this->authors; } - public function getLanguage() + public function getLanguage(): ?string { return $this->language; } - public function getImage() + public function getImage(): ?string { return $this->image; } - public function getSiteConfig() + public function getSiteConfig(): ?SiteConfig { return $this->siteConfig; } - public function getNextPageUrl() + public function getNextPageUrl(): ?string { return $this->nextPageUrl; } @@ -717,7 +714,7 @@ public function getNextPageUrl() * * @return string|null Formatted date using the W3C format (Y-m-d\TH:i:sP) OR null if the date is badly formatted */ - public function validateDate($date) + public function validateDate(?string $date): ?string { $parseDate = (array) date_parse($date); @@ -739,7 +736,7 @@ public function validateDate($date) return (new \DateTime($date))->format(\DateTime::W3C); } - protected function addAuthor($authorDirty) + protected function addAuthor(string $authorDirty): void { $author = trim($authorDirty); if (!\in_array($author, $this->authors, true)) { @@ -751,10 +748,8 @@ protected function addAuthor($authorDirty) * Check if given node list exists and has length more than 0. * * @param \DOMNodeList|false $elems Not force typed because it can also be false - * - * @return bool */ - private function hasElements($elems = false) + private function hasElements($elems = false): bool { if (false === $elems) { return false; @@ -768,8 +763,10 @@ private function hasElements($elems = false) * * @param \DOMNodeList|false $elems Not force typed because it can also be false * @param string $logMessage + * + * @return void|null */ - private function removeElements($elems = false, $logMessage = null) + private function removeElements($elems = false, string $logMessage = null) { if (false === $elems || false === $this->hasElements($elems)) { return; @@ -796,10 +793,11 @@ private function removeElements($elems = false, $logMessage = null) * Wrap elements with provided tag. * * @param \DOMNodeList|false $elems - * @param string $tag * @param string $logMessage + * + * @return void|null */ - private function wrapElements($elems = false, $tag = 'div', $logMessage = null) + private function wrapElements($elems = false, string $tag = 'div', string $logMessage = null) { if (false === $elems || false === $this->hasElements($elems)) { return; @@ -882,11 +880,10 @@ private function extractEntityFromQuery($entity, $detectEntity, $xpathExpression * @param bool $detectTitle Do we have to detect title ? * @param string $cssClass CSS class to look for * @param \DOMNode|null $node DOMNode to look into - * @param string $logMessage * * @return bool Telling if we have to detect title again or not */ - private function extractTitle($detectTitle, $cssClass, \DOMNode $node = null, $logMessage) + private function extractTitle(bool $detectTitle, string $cssClass, \DOMNode $node = null, string $logMessage): bool { if (null === $node) { return true; @@ -907,11 +904,10 @@ private function extractTitle($detectTitle, $cssClass, \DOMNode $node = null, $l * @param bool $detectDate Do we have to detect date ? * @param string $cssClass CSS class to look for * @param \DOMNode|null $node DOMNode to look into - * @param string $logMessage * * @return bool Telling if we have to detect date again or not */ - private function extractDate($detectDate, $cssClass, \DOMNode $node = null, $logMessage) + private function extractDate(bool $detectDate, string $cssClass, \DOMNode $node = null, string $logMessage): bool { if (null === $node) { return true; @@ -934,7 +930,7 @@ private function extractDate($detectDate, $cssClass, \DOMNode $node = null, $log * * @return bool Telling if we have to detect author again or not */ - private function extractAuthor($detectAuthor, \DOMNode $node = null) + private function extractAuthor(bool $detectAuthor, \DOMNode $node = null): bool { if (false === $detectAuthor) { return false; @@ -981,7 +977,7 @@ private function extractAuthor($detectAuthor, \DOMNode $node = null) * * @return bool Telling if we have to detect body again or not */ - private function extractBody($detectBody, $xpathExpression, \DOMNode $node = null, $type) + private function extractBody(bool $detectBody, string $xpathExpression, \DOMNode $node = null, string $type): bool { if (false === $detectBody) { return false; @@ -1066,10 +1062,8 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node = nul * @param string $url URL of the content * @param string $parser Parser to use * @param bool $enableTidy Should it use tidy extension? - * - * @return Readability */ - private function getReadability($html, $url, $parser, $enableTidy) + private function getReadability(string $html, string $url, string $parser, bool $enableTidy): Readability { $readability = new Readability($html, $url, $parser, $enableTidy); @@ -1104,7 +1098,7 @@ private function getReadability($html, $url, $parser, $enableTidy) * * @return bool Telling if the entity has been found */ - private function extractEntityFromPattern($entity, $pattern, $returnCallback = null) + private function extractEntityFromPattern(string $entity, string $pattern, $returnCallback = null): bool { // we define the default callback here if (!\is_callable($returnCallback)) { @@ -1159,7 +1153,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n * * @return bool Telling if the entity has been found */ - private function extractMultipleEntityFromPattern($entity, $pattern, $returnCallback = null) + private function extractMultipleEntityFromPattern(string $entity, string $pattern, $returnCallback = null): bool { // we define the default callback here if (!\is_callable($returnCallback)) { @@ -1207,8 +1201,10 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall * - JSON-LD. * * @param string $html Html from the page + * + * @return void|null */ - private function extractDefinedInformation($html) + private function extractDefinedInformation(string $html) { if ('' === trim($html)) { return; @@ -1236,7 +1232,7 @@ private function extractDefinedInformation($html) * * @see http://stackoverflow.com/a/7454737/569101 */ - private function extractOpenGraph(\DOMXPath $xpath) + private function extractOpenGraph(\DOMXPath $xpath): void { // retrieve "og:" properties $metas = $xpath->query('//*/meta[starts-with(@property, \'og:\')]'); @@ -1316,6 +1312,8 @@ private function extractOpenGraph(\DOMXPath $xpath) /** * Clean extract of JSON-LD authors. + * + * @return array|string */ private function extractAuthorsFromJsonLdArray(array $authors) { @@ -1338,13 +1336,15 @@ private function extractAuthorsFromJsonLdArray(array $authors) * @param \DOMXPath $xpath DOMXpath from the DOMDocument of the page * * @see https://json-ld.org/spec/latest/json-ld/ + * + * @return void|null */ private function extractJsonLdInformation(\DOMXPath $xpath) { $scripts = $xpath->query('//*/script[@type="application/ld+json"]'); if (false === $scripts) { - return null; + return; } $ignoreNames = []; @@ -1364,18 +1364,12 @@ private function extractJsonLdInformation(\DOMXPath $xpath) // just in case datePublished isn't defined, we use the modified one at first if (!empty($data['dateModified'])) { - $this->date = $data['dateModified']; + $this->date = \is_array($data['dateModified']) ? reset($data['dateModified']) : $data['dateModified']; $this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]); } if (!empty($data['datePublished'])) { - $this->date = $data['datePublished']; - $this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]); - } - - // sometimes the date is an array - if (\is_array($this->date)) { - $this->date = reset($this->date); + $this->date = \is_array($data['datePublished']) ? reset($data['datePublished']) : $data['datePublished']; $this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]); } @@ -1408,12 +1402,8 @@ private function extractJsonLdInformation(\DOMXPath $xpath) } if (!empty($data['image']['url'])) { - $this->image = $data['image']['url']; - // some people use ImageObject url field as an array instead of a string... - if (\is_array($data['image']['url'])) { - $this->image = current($data['image']['url']); - } + $this->image = \is_array($data['image']['url']) ? current($data['image']['url']) : $data['image']['url']; } } diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index ba8d4db7..4f95e7ab 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -109,7 +109,7 @@ public function __construct(Client $client, $config = [], LoggerInterface $logge ); } - public function setLogger(LoggerInterface $logger) + public function setLogger(LoggerInterface $logger): void { $this->logger = $logger; } @@ -282,12 +282,8 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) /** * Cleanup URL and retrieve the final url to be called. - * - * @param string $url - * - * @return string */ - private function cleanupUrl($url) + private function cleanupUrl(string $url): string { // rewrite part of urls to something more readable foreach ($this->config['rewrite_url'] as $find => $action) { @@ -323,10 +319,8 @@ private function cleanupUrl($url) * by checking the extension. * * @param string $url Absolute url - * - * @return bool */ - private function possibleUnsupportedType($url) + private function possibleUnsupportedType(string $url): bool { $ext = strtolower(trim(pathinfo($url, \PATHINFO_EXTENSION))); @@ -344,10 +338,8 @@ private function possibleUnsupportedType($url) * * @param string $url Absolute url * @param array $httpHeader Custom HTTP Headers from SiteConfig - * - * @return string */ - private function getUserAgent($url, array $httpHeader = []) + private function getUserAgent(string $url, array $httpHeader = []): string { $ua = $this->config['ua_browser']; @@ -392,10 +384,8 @@ private function getUserAgent($url, array $httpHeader = []) * * @param string $url Absolute url * @param array $httpHeader Custom HTTP Headers from SiteConfig - * - * @return string */ - private function getReferer($url, $httpHeader = []) + private function getReferer(string $url, array $httpHeader = []): string { $default_referer = $this->config['default_referer']; @@ -418,10 +408,8 @@ private function getReferer($url, $httpHeader = []) * * @param string $url Absolute url * @param array $httpHeader Custom HTTP Headers from SiteConfig - * - * @return string|null */ - private function getCookie($url, $httpHeader = []) + private function getCookie(string $url, array $httpHeader = []): ?string { if (!empty($httpHeader['cookie'])) { $this->logger->info('Found cookie "{cookie}" for url "{url}" from site config', ['cookie' => $httpHeader['cookie'], 'url' => $url]); @@ -463,7 +451,7 @@ private function getCookie($url, $httpHeader = []) * * @return string|false */ - private function getAccept($url, $httpHeader = []) + private function getAccept(string $url, array $httpHeader = []) { if (!empty($httpHeader['accept'])) { $this->logger->info('Found accept header "{accept}" for url "{url}" from site config', ['accept' => $httpHeader['accept'], 'url' => $url]); @@ -481,10 +469,8 @@ private function getAccept($url, $httpHeader = []) * Since the request is now done we directly check the Content-Type header * * @param array $headers All headers from the request - * - * @return bool */ - private function headerOnlyType(array $headers) + private function headerOnlyType(array $headers): bool { $contentType = isset($headers['content-type']) ? $headers['content-type'] : ''; @@ -512,7 +498,7 @@ private function headerOnlyType(array $headers) * * @return false|string */ - private function getMetaRefreshURL($url, $html) + private function getMetaRefreshURL(string $url, string $html) { if ('' === $html) { return false; @@ -548,7 +534,7 @@ private function getMetaRefreshURL($url, $html) * * @return false|string */ - private function getUglyURL($url, $html) + private function getUglyURL(string $url, string $html) { $found = false; foreach ($this->config['ajax_triggers'] as $string) { @@ -577,10 +563,8 @@ private function getUglyURL($url, $html) /** * Format all headers to avoid unecessary array level. * Also lower the header name. - * - * @return array */ - private function formatHeaders(ResponseInterface $response) + private function formatHeaders(ResponseInterface $response): array { $headers = []; foreach ($response->getHeaders() as $name => $value) { diff --git a/src/Graby.php b/src/Graby.php index af5053a5..86d11506 100644 --- a/src/Graby.php +++ b/src/Graby.php @@ -27,21 +27,24 @@ */ class Graby { - private $debug = false; + private bool $debug = false; /** @var LoggerInterface */ private $logger; - private $logLevel = 'info'; + private string $logLevel = 'info'; - private $config = []; + private array $config = []; + /** @var HttpClient|null */ private $httpClient = null; + /** @var ContentExtractor|null */ private $extractor = null; /** @var ConfigBuilder */ private $configBuilder; + /** @var Punycode */ private $punycode; - private $imgNoReferrer = false; + private bool $imgNoReferrer = false; /** * @param array $config @@ -128,7 +131,7 @@ public function __construct($config = [], Client $client = null, ConfigBuilder $ /** * Redefine all loggers. */ - public function setLogger(LoggerInterface $logger) + public function setLogger(LoggerInterface $logger): void { $this->logger = $logger; $this->extractor->setLogger($logger); @@ -140,7 +143,7 @@ public function setLogger(LoggerInterface $logger) * * @see ConfigBuilder::loadConfigFiles */ - public function reloadConfigFiles() + public function reloadConfigFiles(): void { $this->configBuilder->loadConfigFiles(); } @@ -148,11 +151,9 @@ public function reloadConfigFiles() /** * Return a config. * - * @param string $key - * * @return mixed */ - public function getConfig($key) + public function getConfig(string $key) { if (!isset($this->config[$key])) { throw new \Exception(sprintf('No config found for key: "%s"', $key)); @@ -164,11 +165,9 @@ public function getConfig($key) /** * Fetch content from the given url and return a readable content. * - * @param string $url - * * @return array With keys html, title, url & summary */ - public function fetchContent($url) + public function fetchContent(string $url): array { $this->logger->info('Graby is ready to fetch'); @@ -180,27 +179,22 @@ public function fetchContent($url) return $infos; } - public function toggleImgNoReferrer($toggle) + public function toggleImgNoReferrer(bool $toggle = false): void { - if (\is_bool($toggle)) { - $this->imgNoReferrer = $toggle; - } + $this->imgNoReferrer = $toggle; } /** * Cleanup HTML from a DOMElement or a string. * - * @param string|\DOMElement $contentBlock - * @param string $url - * - * @return string + * @param string|\DOMElement|\DOMNode $contentBlock */ - public function cleanupHtml($contentBlock, $url) + public function cleanupHtml($contentBlock, string $url): string { - $originalContentBlock = $contentBlock instanceof \DOMElement ? $contentBlock->textContent : $contentBlock; + $originalContentBlock = \is_string($contentBlock) ? $contentBlock : $contentBlock->textContent; // if content is pure html, convert it - if (!$contentBlock instanceof \DOMElement) { + if (\is_string($contentBlock)) { $this->extractor->process($contentBlock, $url); $contentBlock = $this->extractor->getContent(); @@ -275,11 +269,9 @@ public function cleanupHtml($contentBlock, $url) /** * Do fetch content from an url. * - * @param string $url - * * @return array With key status, html, title, language, date, authors, url, image, headers & native_ad */ - private function doFetchContent($url) + private function doFetchContent(string $url): array { $url = $this->validateUrl($url); $siteConfig = $this->configBuilder->buildFromUrl($url); @@ -466,12 +458,8 @@ private function doFetchContent($url) /** * Validate & clean the given url. - * - * @param string $url - * - * @return string */ - private function validateUrl($url) + private function validateUrl(string $url): string { // Check for feed URL $url = trim($url); @@ -518,7 +506,7 @@ private function validateUrl($url) return $url; } - private function isUrlAllowed($url) + private function isUrlAllowed(string $url): bool { $allowedUrls = $this->getConfig('allowed_urls'); $blockedUrls = $this->getConfig('blocked_urls'); @@ -543,12 +531,10 @@ private function isUrlAllowed($url) /** * Based on content-type http header, decide what to do. * - * @param array $headers All headers from the response - * * @return array With keys: 'mime', 'type', 'subtype', 'action', 'name' * e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image') */ - private function getMimeActionInfo(array $headers) + private function getMimeActionInfo(array $headers): array { $contentType = isset($headers['content-type']) ? strtolower($headers['content-type']) : ''; @@ -586,10 +572,8 @@ private function getMimeActionInfo(array $headers) * @param array $mimeInfo From getMimeActionInfo() function * @param string $effectiveUrl Current content url * @param array $response A response - * - * @return array|null */ - private function handleMimeAction($mimeInfo, $effectiveUrl, $response = []) + private function handleMimeAction(array $mimeInfo, string $effectiveUrl, array $response = []): ?array { if (!isset($mimeInfo['action']) || !\in_array($mimeInfo['action'], ['link', 'exclude'], true)) { return null; @@ -678,12 +662,9 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = []) /** * returns single page response, or false if not found. * - * @param string $html - * @param string $url - * * @return false|array From httpClient fetch */ - private function getSinglePage($html, $url) + private function getSinglePage(string $html, string $url) { $this->logger->info('Looking for site config files to see if single page link exists'); $siteConfig = $this->configBuilder->buildFromUrl($url); @@ -777,7 +758,7 @@ private function getSinglePage($html, $url) * @param string $base The base url * @param \DOMElement $elem Element on which we'll retrieve the attribute */ - private function makeAbsolute($base, \DOMElement $elem) + private function makeAbsolute(string $base, \DOMElement $elem): void { foreach (['a' => 'href', 'img' => 'src', 'iframe' => 'src'] as $tag => $attr) { $elems = $elem->getElementsByTagName($tag); @@ -802,7 +783,7 @@ private function makeAbsolute($base, \DOMElement $elem) * @param \DOMNode $e Element on which we'll retrieve the attribute * @param string $attr Attribute that contains the url to absolutize */ - private function makeAbsoluteAttr($base, \DOMNode $e, $attr) + private function makeAbsoluteAttr(string $base, \DOMNode $e, $attr): void { if (!$e->attributes->getNamedItem($attr) || !$e instanceof \DOMElement) { return; @@ -832,7 +813,7 @@ private function makeAbsoluteAttr($base, \DOMNode $e, $attr) * * @return false|string */ - private function makeAbsoluteStr($base, $url) + private function makeAbsoluteStr(string $base, string $url) { if (!$url) { return false; @@ -858,14 +839,8 @@ private function makeAbsoluteStr($base, $url) * Truncate text. * * @see https://github.com/twigphp/Twig-extensions/blob/449e3c8a9ffad7c2479c7864557275a32b037499/lib/Twig/Extensions/Extension/Text.php#L40 - * - * @param string $text - * @param int $length - * @param string $separator - * - * @return string */ - private function getExcerpt($text, $length = 250, $separator = ' …') + private function getExcerpt(string $text, int $length = 250, ?string $separator = ' …'): string { // use regex instead of strip_tags to left some spaces when removing tags $text = preg_replace('#<[^>]+>#', ' ', (string) $text); @@ -894,13 +869,8 @@ private function getExcerpt($text, $length = 250, $separator = ' …') * (uses HTTP headers and HTML to find encoding). * * Adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8 - * - * @param string $html - * @param array $headers All headers from the response - * - * @return string */ - private function convert2Utf8($html, array $headers = []) + private function convert2Utf8(string $html, array $headers = []): string { $contentType = isset($headers['content-type']) ? strtolower($headers['content-type']) : ''; @@ -912,7 +882,7 @@ private function convert2Utf8($html, array $headers = []) // remove strange things $html = str_replace('', '', $html); - if (empty($contentType) || !preg_match_all('/([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $contentType, $match, \PREG_SET_ORDER)) { + if (!preg_match_all('/([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $contentType, $match, \PREG_SET_ORDER)) { // error parsing the response $this->logger->info('Could not find Content-Type header in HTTP response', ['headers' => $headers]); } else { @@ -1002,12 +972,8 @@ private function convert2Utf8($html, array $headers = []) /** * Try to cleanup XSS using htmLawed. - * - * @param string $html - * - * @return string */ - private function cleanupXss($html) + private function cleanupXss(string $html): string { if (false === $this->config['xss_filter']) { return $html; diff --git a/src/HttpClient/Plugin/CookiePlugin.php b/src/HttpClient/Plugin/CookiePlugin.php index 7da4be90..1ba7af4e 100644 --- a/src/HttpClient/Plugin/CookiePlugin.php +++ b/src/HttpClient/Plugin/CookiePlugin.php @@ -95,7 +95,7 @@ private function createCookie(RequestInterface $request, string $setCookieHeader { $parts = array_map('trim', explode(';', $setCookieHeader)); - if (empty($parts) || !strpos($parts[0], '=')) { + if ('' === $parts[0] || false === strpos($parts[0], '=')) { return null; } diff --git a/src/HttpClient/Plugin/History.php b/src/HttpClient/Plugin/History.php index 746e097b..a3e244cc 100644 --- a/src/HttpClient/Plugin/History.php +++ b/src/HttpClient/Plugin/History.php @@ -18,29 +18,23 @@ class History implements Journal */ private $lastResponse; - /** - * @return RequestInterface|null - */ - public function getLastRequest() + public function getLastRequest(): ?RequestInterface { return $this->lastRequest; } - /** - * @return ResponseInterface|null - */ - public function getLastResponse() + public function getLastResponse(): ?ResponseInterface { return $this->lastResponse; } - public function addSuccess(RequestInterface $request, ResponseInterface $response) + public function addSuccess(RequestInterface $request, ResponseInterface $response): void { $this->lastRequest = $request; $this->lastResponse = $response; } - public function addFailure(RequestInterface $request, ClientExceptionInterface $exception) + public function addFailure(RequestInterface $request, ClientExceptionInterface $exception): void { } } diff --git a/src/Monolog/Handler/GrabyHandler.php b/src/Monolog/Handler/GrabyHandler.php index 0d6b3019..a5d8494d 100644 --- a/src/Monolog/Handler/GrabyHandler.php +++ b/src/Monolog/Handler/GrabyHandler.php @@ -13,8 +13,8 @@ */ class GrabyHandler extends AbstractProcessingHandler { - protected $records = []; - protected $recordsByLevel = []; + protected array $records = []; + protected array $recordsByLevel = []; public function __construct($level = Logger::DEBUG, $bubble = true) { @@ -24,18 +24,21 @@ public function __construct($level = Logger::DEBUG, $bubble = true) $this->pushProcessor(new PsrLogMessageProcessor()); } - public function getRecords() + public function getRecords(): array { return $this->records; } - public function clear() + public function clear(): void { $this->records = []; $this->recordsByLevel = []; } - public function hasRecords($level) + /** + * @param string|int $level Logging level value or name + */ + public function hasRecords($level): bool { return isset($this->recordsByLevel[$level]); } diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php index e7c8af4e..73e37311 100644 --- a/src/SiteConfig/ConfigBuilder.php +++ b/src/SiteConfig/ConfigBuilder.php @@ -11,12 +11,12 @@ class ConfigBuilder { /** @var LoggerInterface */ private $logger; - private $config = []; - private $configFiles = []; - private $cache = []; + private array $config = []; + private array $configFiles = []; + private array $cache = []; // Array for accepted headers for http_header() - private $acceptedHeaders = [ + private array $acceptedHeaders = [ 'user-agent', 'referer', 'cookie', @@ -24,7 +24,7 @@ class ConfigBuilder ]; // Array of accepted HTML tags for wrap_in() - private $acceptedWrapInTags = [ + private array $acceptedWrapInTags = [ 'blockquote', 'p', 'div', @@ -52,7 +52,7 @@ public function __construct($config = [], LoggerInterface $logger = null) $this->loadConfigFiles(); } - public function setLogger(LoggerInterface $logger) + public function setLogger(LoggerInterface $logger): void { $this->logger = $logger; } @@ -64,7 +64,7 @@ public function setLogger(LoggerInterface $logger) * - If we add a new file after, it won't be loaded. * - We'll need to manually reload config files. */ - public function loadConfigFiles() + public function loadConfigFiles(): void { $this->configFiles = Files::getFiles($this->config['site_config']); } @@ -75,7 +75,7 @@ public function loadConfigFiles() * @param string $key Key for the cache * @param SiteConfig $config Config to be cached */ - public function addToCache($key, SiteConfig $config) + public function addToCache($key, SiteConfig $config): void { $key = strtolower($key); if ('www.' === substr($key, 0, 4)) { @@ -419,20 +419,21 @@ public function parseLines(array $lines) * @param SiteConfig $config Current config * @param string $condition XPath condition */ - private function handleIfPageContainsCondition(SiteConfig $config, $condition) + private function handleIfPageContainsCondition(SiteConfig $config, string $condition): void { + $rule = false; if (!empty($config->single_page_link)) { $rule = 'single_page_link'; } elseif (!empty($config->next_page_link)) { $rule = 'next_page_link'; - } else { - // no link found, we can't apply "if_page_contains" - return; } - $key = end($config->$rule); - reset($config->$rule); + // no link found, we can't apply "if_page_contains" + if ($rule) { + $key = end($config->$rule); + reset($config->$rule); - $config->if_page_contains[$rule][$key] = (string) $condition; + $config->if_page_contains[$rule][$key] = (string) $condition; + } } } diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index fd4d8906..2a5d8170 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -163,10 +163,10 @@ public function testProcessFindString(): void $this->assertTrue($res, 'Extraction went well'); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); - $this->assertStringContainsString('', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function dataForNextPage(): array @@ -375,7 +375,7 @@ public function testApplyStrip(string $pattern, string $html, string $removedCon $domElement = $contentExtractor->readability->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringNotContainsString($removedContent, $content); + $this->assertStringNotContainsString($removedContent, (string) $content); } public function dataForStripIdOrClass(): array @@ -407,9 +407,9 @@ public function testApplyStripIdOrClass(string $pattern, string $html, ?string $ $content = $domElement->ownerDocument->saveXML($domElement); if (null === $removedContent) { - $this->assertStringContainsString((string) $matchContent, $content); + $this->assertStringContainsString((string) $matchContent, (string) $content); } else { - $this->assertStringNotContainsString($removedContent, $content); + $this->assertStringNotContainsString($removedContent, (string) $content); } } @@ -442,7 +442,7 @@ public function testApplyStripImageSrc(string $pattern, string $html, string $re $domElement = $contentExtractor->readability->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringNotContainsString($removedContent, $content); + $this->assertStringNotContainsString($removedContent, (string) $content); } public function dataForStripDisplayNoneAndInstapaper(): array @@ -475,7 +475,7 @@ public function testApplyStripDisplayNoneAndInstapaper(string $html, string $rem $domElement = $contentExtractor->readability->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringNotContainsString($removedContent, $content); + $this->assertStringNotContainsString($removedContent, (string) $content); } public function dataForStripAttr(): array @@ -514,11 +514,11 @@ public function testApplyStripAttr(array $patterns, string $html, array $asserti $content = $domElement->ownerDocument->saveXML($domElement); foreach ($assertions['removedContent'] as $removedContent) { - $this->assertStringNotContainsString($removedContent, $content); + $this->assertStringNotContainsString($removedContent, (string) $content); } foreach ($assertions['keptContent'] as $keptContent) { - $this->assertStringContainsString($keptContent, $content); + $this->assertStringContainsString($keptContent, (string) $content); } } @@ -719,7 +719,7 @@ public function testRemoveHFromBody(): void $domElement = $contentExtractor->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringNotContainsString('My Title', $content); + $this->assertStringNotContainsString('My Title', (string) $content); $this->assertSame('My Title', $contentExtractor->getTitle()); } @@ -793,7 +793,7 @@ public function testConvertLazyLoadImages(string $html, string $htmlExpected): v $domElement = $contentExtractor->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringContainsString($htmlExpected, $content); + $this->assertStringContainsString($htmlExpected, (string) $content); } public function testIframeEmbeddedContent(): void @@ -817,7 +817,7 @@ public function testIframeEmbeddedContent(): void $domElement = $contentExtractor->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringContainsString('', $content); + $this->assertStringContainsString('', (string) $content); } public function testLogMessage(): void @@ -916,8 +916,8 @@ function handleError(){return true;} $domElement = $contentExtractor->getContent(); $content = $domElement->ownerDocument->saveXML($domElement); - $this->assertStringNotContainsString('', $content); - $this->assertStringNotContainsString('', $content); + $this->assertStringNotContainsString('', (string) $content); + $this->assertStringNotContainsString('', (string) $content); } public function testNativeAd(): void @@ -931,10 +931,10 @@ public function testNativeAd(): void $this->assertTrue($res, 'Extraction went well'); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertTrue($contentExtractor->isNativeAd()); - $this->assertStringContainsString('

hihi

', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('

hihi

', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function testJsonLd(): void @@ -948,13 +948,13 @@ public function testJsonLd(): void $this->assertTrue($res, 'Extraction went well'); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertSame('title !!', $contentExtractor->getTitle()); $this->assertSame('2017-10-23T16:05:38+02:00', $contentExtractor->getDate()); - $this->assertStringContainsString('bob', $contentExtractor->getAuthors()[0]); + $this->assertStringContainsString('bob', (string) $contentExtractor->getAuthors()[0]); $this->assertSame('https://static.jsonld.io/medias.jpg', $contentExtractor->getImage()); - $this->assertStringContainsString('

hihi

', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('

hihi

', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function testJsonLdWithMultipleAuthors(): void @@ -966,7 +966,7 @@ public function testJsonLdWithMultipleAuthors(): void 'https://nativead.io/jsonld' ); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertSame([ 'Elisa Thevenet', @@ -1006,13 +1006,13 @@ public function testOpenGraph(): void $this->assertTrue($res); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertSame('title !!', $contentExtractor->getTitle()); $this->assertSame('2017-10-23T17:04:21+00:00', $contentExtractor->getDate()); $this->assertSame('fr_FR', $contentExtractor->getLanguage()); $this->assertSame('https://static.opengraph.io/medias_11570.jpg', $contentExtractor->getImage()); - $this->assertStringContainsString('

hihi

', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('

hihi

', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function testAvoidDataUriImageInOpenGraph(): void @@ -1026,10 +1026,10 @@ public function testAvoidDataUriImageInOpenGraph(): void $this->assertTrue($res); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertEmpty($contentExtractor->getImage()); - $this->assertStringContainsString('

hihi

', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('

hihi

', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function testJsonLdIgnoreList(): void @@ -1044,7 +1044,7 @@ public function testJsonLdIgnoreList(): void $this->assertTrue($res, 'Extraction went well'); $this->assertSame('The Foobar Company is launching globally', $contentExtractor->getTitle()); - $this->assertStringContainsString('Foobar CEO', $contentExtractor->getAuthors()[0]); + $this->assertStringContainsString('Foobar CEO', (string) $contentExtractor->getAuthors()[0]); } public function testJsonLdIgnoreListWithPeriodical(): void @@ -1076,12 +1076,12 @@ public function testJsonLdSkipper(): void $this->assertTrue($res, 'Extraction went well'); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $this->assertEmpty($contentExtractor->getTitle()); $this->assertNull($contentExtractor->getDate()); $this->assertEmpty($contentExtractor->getAuthors()); - $this->assertStringContainsString('this is the best part of the show', $content_block->ownerDocument->saveXML($content_block)); + $this->assertStringContainsString('this is the best part of the show', (string) $contentBlock->ownerDocument->saveXML($contentBlock)); } public function testJsonLdName(): void @@ -1133,7 +1133,7 @@ public function testUniqueAuthors(): void $url, $siteConfig ); - $authors = $contentExtractor->getAuthors(); + $authors = (array) $contentExtractor->getAuthors(); $authorsUnique = array_unique($authors); $this->assertTrue(\count($authors) === \count($authorsUnique), 'There is no duplicate authors'); @@ -1209,9 +1209,9 @@ public function testProcessWrapIn(array $wrapIn, string $xpathQuery): void $this->assertTrue($res, 'Extraction went well'); - $content_block = $contentExtractor->getContent(); + $contentBlock = $contentExtractor->getContent(); $doc = new \DOMDocument(); - $doc->loadXML($content_block->innerHTML); + $doc->loadXML($contentBlock->innerHTML); $xpath = new \DOMXPath($doc); $el = $xpath->query($xpathQuery); From f461285b200c6ab27257cb57c1e4880773c42205 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Nov 2021 22:51:15 +0100 Subject: [PATCH 2/8] Drop PHP < 7.4 --- .github/workflows/continuous-integration.yml | 5 +---- composer.json | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml index 0310e04e..18da1301 100644 --- a/.github/workflows/continuous-integration.yml +++ b/.github/workflows/continuous-integration.yml @@ -19,9 +19,6 @@ jobs: strategy: matrix: php: - - "7.1" - - "7.2" - - "7.3" - "7.4" - "8.0" - "8.1" @@ -105,7 +102,7 @@ jobs: strategy: matrix: php: - - "7.2" + - "7.4" steps: - name: "Checkout" diff --git a/composer.json b/composer.json index d3dcaae8..1bbb48c2 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,7 @@ "minimum-stability": "dev", "prefer-stable": true, "require": { - "php": ">=7.1.3", + "php": ">=7.4", "ext-curl": "*", "ext-tidy": "*", "fossar/htmlawed": "^1.2.7", @@ -42,7 +42,7 @@ "phpstan/phpstan": "^1.2", "phpstan/phpstan-deprecation-rules": "^1.0", "phpstan/phpstan-phpunit": "^1.0", - "symfony/phpunit-bridge": "^5.4" + "symfony/phpunit-bridge": "^6.0" }, "extra": { "branch-alias": { From d025372a6ca3bb59d775cdb4b07c6fbee9103637 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Nov 2021 22:53:31 +0100 Subject: [PATCH 3/8] Fix test which throw a 404 now :( --- tests/GrabyFunctionalTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php index d7718d95..30b0a8dc 100644 --- a/tests/GrabyFunctionalTest.php +++ b/tests/GrabyFunctionalTest.php @@ -163,7 +163,7 @@ public function dataWithAccent(): array return [ // ['http://pérotin.com/post/2015/08/31/Le-cadran-solaire-amoureux'], ['https://en.wikipedia.org/wiki/Café'], - ['http://www.atterres.org/article/budget-2016-les-10-méprises-libérales-du-gouvernement'], + // ['http://www.atterres.org/article/budget-2016-les-10-méprises-libérales-du-gouvernement'], ]; } From 552e48e76461767d475b19d288a76a85b9987c41 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Nov 2021 22:58:43 +0100 Subject: [PATCH 4/8] Fix PHP 8.1 - `strtotime(): Passing null to parameter #1 ($datetime) of type string is deprecated` - `date_parse(): Passing null to parameter #1 ($datetime) of type string is deprecated` --- phpunit.xml | 5 +++++ src/Extractor/ContentExtractor.php | 1 + tests/Extractor/ContentExtractorTest.php | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/phpunit.xml b/phpunit.xml index f3063b5a..811f2849 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -16,6 +16,11 @@ + + + + + ./src diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 71d125b3..6fb80a02 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -716,6 +716,7 @@ public function getNextPageUrl(): ?string */ public function validateDate(?string $date): ?string { + $date = (string) $date; $parseDate = (array) date_parse($date); // If no year has been found during date_parse, we nuke the whole value diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index 2a5d8170..84f10b50 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -847,7 +847,7 @@ public function testLogMessage(): void $this->assertSame('Trying {pattern} for language', $records[4]['message']); $this->assertSame('Trying {pattern} for language', $records[5]['message']); $this->assertSame('Using Readability', $records[6]['message']); - $this->assertSame('Date is bad (strtotime failed): {date}', $records[7]['message']); + $this->assertSame('Date is bad (wrong year): {date}', $records[7]['message']); $this->assertSame('Attempting to parse HTML with {parser}', $records[9]['message']); } From 94568dab7fbda1fbc96a62d0b257adda175a0c57 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 29 Nov 2021 23:22:45 +0100 Subject: [PATCH 5/8] Prepare 3.0 --- README.md | 2 +- UPGRADE-2.0.md => UPGRADE.md | 13 ++++++++++++- composer.json | 5 ----- 3 files changed, 13 insertions(+), 7 deletions(-) rename UPGRADE-2.0.md => UPGRADE.md (79%) diff --git a/README.md b/README.md index 87db62c6..1b671ac2 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ That's why I made this fork: ### Requirements -- PHP >= 7.1 +- PHP >= 7.4 - [Tidy](https://github.com/htacg/tidy-html5) & cURL extensions enabled ### Installation diff --git a/UPGRADE-2.0.md b/UPGRADE.md similarity index 79% rename from UPGRADE-2.0.md rename to UPGRADE.md index 276a9361..355931e3 100644 --- a/UPGRADE-2.0.md +++ b/UPGRADE.md @@ -1,4 +1,15 @@ -# UPGRADE FROM 1.x to 2.0 +# FROM 2.x to 3.0 + +It should be easy if you didn't override or extend Graby. +I tried to typehint everything (method parameters, method return, variable, etc.). + +So you must update methods you overriden. + +### :warning: BC changes + +- Support for PHP < 7.4 has been dropped + +# FROM 1.x to 2.0 ### :warning: BC changes diff --git a/composer.json b/composer.json index 1bbb48c2..e16fb222 100644 --- a/composer.json +++ b/composer.json @@ -44,11 +44,6 @@ "phpstan/phpstan-phpunit": "^1.0", "symfony/phpunit-bridge": "^6.0" }, - "extra": { - "branch-alias": { - "dev-2.0": "2.0-dev" - } - }, "autoload": { "psr-4": { "Graby\\": "src/" From 89f49f6da125ec12aec5cf4f285796bd796f07ec Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sat, 23 Jan 2021 00:52:07 +0100 Subject: [PATCH 6/8] HttpClient: refactor the way we clean utm_ query params The old preg_replace call incorrectly removed the ? of the query string section which could lead to things like this: http://example.com/foo?utm_source=a&var=value => http://example.com/foo&var=value Signed-off-by: Kevin Decherf --- src/Extractor/HttpClient.php | 7 ++++++- tests/Extractor/HttpClientTest.php | 32 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index 4f95e7ab..f48cf177 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) } // remove utm parameters & fragment - $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl)); + $uri = new Uri(str_replace('&', '&', $effectiveUrl)); + parse_str($uri->getQuery(), $query); + $queryParameters = array_filter($query, function ($k) { + return !(0 === stripos($k, 'utm_')); + }, \ARRAY_FILTER_USE_KEY); + $effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters); $this->logger->info('Data fetched: {data}', ['data' => [ 'effective_url' => $effectiveUrl, diff --git a/tests/Extractor/HttpClientTest.php b/tests/Extractor/HttpClientTest.php index 3ca97dcd..5b3f7de6 100644 --- a/tests/Extractor/HttpClientTest.php +++ b/tests/Extractor/HttpClientTest.php @@ -620,4 +620,36 @@ public function testAccept(string $url, array $httpHeader, $expectedAccept): voi $this->assertArrayNotHasKey('accept', $records[3]['context']); } } + + public function dataForWithUrlContainingQueryAndFragment(): array + { + return [ + [ + 'url' => 'https://example.com/foo?utm_content=111315005&utm_medium=social&utm_source=twitter&hss_channel=tw-hello', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?hss_channel=tw-hello#fragment', + 'expectedUrl' => 'https://example.com/foo?hss_channel=tw-hello', + ], + [ + 'url' => 'https://example.com/foo?utm_content=111315005', + 'expectedUrl' => 'https://example.com/foo', + ], + ]; + } + + /** + * @dataProvider dataForWithUrlContainingQueryAndFragment + */ + public function testWithUrlContainingQueryAndFragment(string $url, string $expectedUrl): void + { + $httpMockClient = new HttpMockClient(); + $httpMockClient->addResponse(new Response(200)); + + $http = new HttpClient($httpMockClient); + $res = $http->fetch($url); + + $this->assertSame($expectedUrl, $res['effective_url']); + } } From df6c922f7ed33542527b87b9a24d954463dfdd9f Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sat, 23 Jan 2021 18:44:14 +0100 Subject: [PATCH 7/8] tests: fix tests following query string refactor As per RFC3986, Section 3.4 indicates that we can avoid percent-encoding the'/' and '?' characters. However the '=' character in the nested query string should be encoded. Signed-off-by: Kevin Decherf --- tests/GrabyFunctionalTest.php | 2 +- tests/GrabyTest.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php index 30b0a8dc..bc31b12e 100644 --- a/tests/GrabyFunctionalTest.php +++ b/tests/GrabyFunctionalTest.php @@ -220,7 +220,7 @@ public function testYoutubeOembed(): void $this->assertSame(200, $res['status']); $this->assertEmpty($res['language']); - $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v=td0P8qrS8iI&format=xml', $res['url']); + $this->assertSame('https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v%3Dtd0P8qrS8iI&format=xml', $res['url']); $this->assertSame('[Review] The Matrix Falling (Rain) Source Code C++', $res['title']); // $this->assertSame('', $res['html']); $this->assertSame('[embedded content]', $res['summary']); diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php index 3c349b5f..38f0516f 100644 --- a/tests/GrabyTest.php +++ b/tests/GrabyTest.php @@ -403,7 +403,7 @@ public function testAssetExtensionTXT(): void public function dataForSinglePage(): array { return [ - 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed view', 'http://moreintelligentlife.com/print/content'], + 'single_page_link will return a string (ie the text content of node)' => ['singlepage1.com', 'http://singlepage1.com/printed%20view', 'http://moreintelligentlife.com/print/content'], 'single_page_link will return the a node' => ['singlepage2.com', 'http://singlepage2.com/print/content', 'http://singlepage2.com/print/content'], 'single_page_link will return the href from a node' => ['singlepage3.com', 'http://singlepage3.com/print/content', 'http://singlepage3.com/print/content'], 'single_page_link will return nothing useful' => ['singlepage4.com', 'http://singlepage4.com', 'http://singlepage4.com/print/content'], From 8bcef771099f4d365c9d15a5a5fb2221d37bc330 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sat, 23 Jan 2021 19:05:25 +0100 Subject: [PATCH 8/8] dependencies: set min version of guzzlehttp/psr7 to 1.5.0 Uri::withQueryValues() has been added in psr7 1.5.0 but our direct dependency http-factory-guzzle is currently set to require ^1.4.2 which could cause issues on some installations. Signed-off-by: Kevin Decherf --- composer.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer.json b/composer.json index e16fb222..cd393108 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,8 @@ "simplepie/simplepie": "^1.5", "smalot/pdfparser": "^2.0", "symfony/options-resolver": "^3.4|^4.4|^5.3|^6.0", - "true/punycode": "^2.1" + "true/punycode": "^2.1", + "guzzlehttp/psr7": "^1.5.0" }, "require-dev": { "friendsofphp/php-cs-fixer": "^3.0",