diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml
index 5d8714c4..6718bb3e 100644
--- a/.github/workflows/coding-standards.yml
+++ b/.github/workflows/coding-standards.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
php:
- - "7.3"
+ - "7.4"
steps:
- name: "Checkout"
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
index a59ecd7c..18da1301 100644
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@@ -19,11 +19,9 @@ jobs:
strategy:
matrix:
php:
- - "7.1"
- - "7.2"
- - "7.3"
- "7.4"
- "8.0"
+ - "8.1"
steps:
- name: "Checkout"
@@ -104,7 +102,7 @@ jobs:
strategy:
matrix:
php:
- - "7.2"
+ - "7.4"
steps:
- name: "Checkout"
diff --git a/README.md b/README.md
index 87db62c6..1b671ac2 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ That's why I made this fork:
### Requirements
-- PHP >= 7.1
+- PHP >= 7.4
- [Tidy](https://github.com/htacg/tidy-html5) & cURL extensions enabled
### Installation
diff --git a/UPGRADE-2.0.md b/UPGRADE.md
similarity index 79%
rename from UPGRADE-2.0.md
rename to UPGRADE.md
index 276a9361..355931e3 100644
--- a/UPGRADE-2.0.md
+++ b/UPGRADE.md
@@ -1,4 +1,15 @@
-# UPGRADE FROM 1.x to 2.0
+# FROM 2.x to 3.0
+
+It should be easy if you didn't override or extend Graby.
+I tried to typehint everything (method parameters, method return, variable, etc.).
+
+So you must update any methods you have overridden.
+
+### :warning: BC changes
+
+- Support for PHP < 7.4 has been dropped
+
+# FROM 1.x to 2.0
### :warning: BC changes
diff --git a/composer.json b/composer.json
index e6068e94..cd393108 100644
--- a/composer.json
+++ b/composer.json
@@ -15,7 +15,7 @@
"minimum-stability": "dev",
"prefer-stable": true,
"require": {
- "php": ">=7.1.3",
+ "php": ">=7.4",
"ext-curl": "*",
"ext-tidy": "*",
"fossar/htmlawed": "^1.2.7",
@@ -29,9 +29,10 @@
"php-http/httplug": "^2.2",
"php-http/message": "^1.9",
"simplepie/simplepie": "^1.5",
- "smalot/pdfparser": "^1.0",
- "symfony/options-resolver": "^3.4|^4.4|^5.3",
- "true/punycode": "^2.1"
+ "smalot/pdfparser": "^2.0",
+ "symfony/options-resolver": "^3.4|^4.4|^5.3|^6.0",
+ "true/punycode": "^2.1",
+ "guzzlehttp/psr7": "^1.5.0"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.0",
@@ -39,15 +40,10 @@
"php-http/guzzle6-adapter": "^2.0",
"php-http/mock-client": "^1.4",
"phpstan/extension-installer": "^1.0",
- "phpstan/phpstan": "^0.12",
- "phpstan/phpstan-deprecation-rules": "^0.12",
- "phpstan/phpstan-phpunit": "^0.12",
- "symfony/phpunit-bridge": "^5.3"
- },
- "extra": {
- "branch-alias": {
- "dev-2.0": "2.0-dev"
- }
+ "phpstan/phpstan": "^1.2",
+ "phpstan/phpstan-deprecation-rules": "^1.0",
+ "phpstan/phpstan-phpunit": "^1.0",
+ "symfony/phpunit-bridge": "^6.0"
},
"autoload": {
"psr-4": {
diff --git a/phpstan.neon b/phpstan.neon
index 720bf5ea..6482e32c 100644
--- a/phpstan.neon
+++ b/phpstan.neon
@@ -12,15 +12,27 @@ parameters:
-
message: '#Http\\Adapter\\Guzzle5\\Client\\|Http\\Adapter\\Guzzle6\\Client\\|Http\\Client\\Curl\\Client given#'
path: %currentWorkingDirectory%/tests/Extractor/HttpClientTest.php
- # we don't want to BC by defining typehint everywhere
- # TODO: remove when jumping to 3.0
- -
- message: '#typehint specified.#'
- path: %currentWorkingDirectory%/src/
# phpstan does not seem to recognize the class override for JSLikeHTMLElement
-
message: '#Call to an undefined method DOMElement::setInnerHtml\(\)#'
path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php
+ -
+ message: '#\$innerHTML#'
+ path: %currentWorkingDirectory%
+ # can't find a way to cast or properly define some return elements
+ -
+ message: '#DOMNode, object given#'
+ path: %currentWorkingDirectory%/src/Extractor/ContentExtractor.php
+ # other errors I might have introduced with DOM* classes
+ -
+ message: '#DOMNode::\$tagName#'
+ path: %currentWorkingDirectory%/src/
+ -
+ message: '#DOMNode::getElementsByTagName#'
+ path: %currentWorkingDirectory%/src/
+ -
+ message: '#expects DOMElement, DOMNode given#'
+ path: %currentWorkingDirectory%/src/Graby.php
inferPrivatePropertyTypeFromConstructor: true
checkMissingIterableValueType: false
diff --git a/phpunit.xml b/phpunit.xml
index f3063b5a..811f2849 100644
--- a/phpunit.xml
+++ b/phpunit.xml
@@ -16,6 +16,11 @@
+
+
+
+
+
./src
diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php
index 9319b8fa..6fb80a02 100644
--- a/src/Extractor/ContentExtractor.php
+++ b/src/Extractor/ContentExtractor.php
@@ -17,20 +17,24 @@
*/
class ContentExtractor
{
+ /** @var Readability|null */
public $readability;
+ /** @var \DOMXPath|null */
private $xpath;
- private $html;
- private $config;
+ private ?string $html = null;
+ private array $config;
+ /** @var SiteConfig|null */
private $siteConfig = null;
- private $title = null;
- private $language = null;
- private $authors = [];
+ private ?string $title = null;
+ private ?string $language = null;
+ private array $authors = [];
+ /** @var \DOMElement|\DOMNode|null */
private $body = null;
- private $image = null;
- private $nativeAd = false;
- private $date = null;
- private $success = false;
- private $nextPageUrl;
+ private ?string $image = null;
+ private bool $nativeAd = false;
+ private ?string $date = null;
+ private bool $success = false;
+ private ?string $nextPageUrl = null;
/** @var LoggerInterface */
private $logger;
/** @var ConfigBuilder */
@@ -74,13 +78,13 @@ public function __construct($config = [], LoggerInterface $logger = null, Config
$this->configBuilder = null === $configBuilder ? new ConfigBuilder($this->config['config_builder'], $this->logger) : $configBuilder;
}
- public function setLogger(LoggerInterface $logger)
+ public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
$this->configBuilder->setLogger($logger);
}
- public function reset()
+ public function reset(): void
{
$this->xpath = null;
$this->html = null;
@@ -100,11 +104,9 @@ public function reset()
* Try to find a host depending on a meta that can be in the html.
* It allow to determine if a website is generated using Wordpress, Blogger, etc ..
*
- * @param string $html
- *
* @return string|false
*/
- public function findHostUsingFingerprints($html)
+ public function findHostUsingFingerprints(string $html)
{
foreach ($this->config['fingerprints'] as $metaPattern => $host) {
if (1 === preg_match($metaPattern, $html)) {
@@ -117,14 +119,8 @@ public function findHostUsingFingerprints($html)
/**
* Returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default).
- *
- * @param string $url
- * @param string $html
- * @param bool $addToCache
- *
- * @return SiteConfig
*/
- public function buildSiteConfig($url, $html = '', $addToCache = true)
+ public function buildSiteConfig(string $url, string $html = '', bool $addToCache = true): SiteConfig
{
$config = $this->configBuilder->buildFromUrl($url, $addToCache);
@@ -158,14 +154,12 @@ public function buildSiteConfig($url, $html = '', $addToCache = true)
* Tidy helps us deal with PHP's patchy HTML parsing most of the time
* but it has problems of its own which we try to avoid with this option.
*
- * @param string $html
- * @param string $url
* @param SiteConfig $siteConfig Will avoid to recalculate the site config
* @param bool $smartTidy Do we need to tidy the html ?
*
* @return bool true on success, false on failure
*/
- public function process($html, $url, SiteConfig $siteConfig = null, $smartTidy = true)
+ public function process(string $html, string $url, SiteConfig $siteConfig = null, $smartTidy = true): bool
{
$this->reset();
@@ -665,47 +659,50 @@ function ($element, $currentEntity) {
return $this->success;
}
+ /**
+ * @return \DOMElement|\DOMNode|null
+ */
public function getContent()
{
return $this->body;
}
- public function isNativeAd()
+ public function isNativeAd(): bool
{
return $this->nativeAd;
}
- public function getTitle()
+ public function getTitle(): ?string
{
return $this->title;
}
- public function getDate()
+ public function getDate(): ?string
{
return $this->date;
}
- public function getAuthors()
+ public function getAuthors(): ?array
{
return $this->authors;
}
- public function getLanguage()
+ public function getLanguage(): ?string
{
return $this->language;
}
- public function getImage()
+ public function getImage(): ?string
{
return $this->image;
}
- public function getSiteConfig()
+ public function getSiteConfig(): ?SiteConfig
{
return $this->siteConfig;
}
- public function getNextPageUrl()
+ public function getNextPageUrl(): ?string
{
return $this->nextPageUrl;
}
@@ -717,8 +714,9 @@ public function getNextPageUrl()
*
* @return string|null Formatted date using the W3C format (Y-m-d\TH:i:sP) OR null if the date is badly formatted
*/
- public function validateDate($date)
+ public function validateDate(?string $date): ?string
{
+ $date = (string) $date;
$parseDate = (array) date_parse($date);
// If no year has been found during date_parse, we nuke the whole value
@@ -739,7 +737,7 @@ public function validateDate($date)
return (new \DateTime($date))->format(\DateTime::W3C);
}
- protected function addAuthor($authorDirty)
+ protected function addAuthor(string $authorDirty): void
{
$author = trim($authorDirty);
if (!\in_array($author, $this->authors, true)) {
@@ -751,10 +749,8 @@ protected function addAuthor($authorDirty)
* Check if given node list exists and has length more than 0.
*
* @param \DOMNodeList|false $elems Not force typed because it can also be false
- *
- * @return bool
*/
- private function hasElements($elems = false)
+ private function hasElements($elems = false): bool
{
if (false === $elems) {
return false;
@@ -768,8 +764,10 @@ private function hasElements($elems = false)
*
* @param \DOMNodeList|false $elems Not force typed because it can also be false
* @param string $logMessage
+ *
+ * @return void|null
*/
- private function removeElements($elems = false, $logMessage = null)
+ private function removeElements($elems = false, string $logMessage = null)
{
if (false === $elems || false === $this->hasElements($elems)) {
return;
@@ -796,10 +794,11 @@ private function removeElements($elems = false, $logMessage = null)
* Wrap elements with provided tag.
*
* @param \DOMNodeList|false $elems
- * @param string $tag
* @param string $logMessage
+ *
+ * @return void|null
*/
- private function wrapElements($elems = false, $tag = 'div', $logMessage = null)
+ private function wrapElements($elems = false, string $tag = 'div', string $logMessage = null)
{
if (false === $elems || false === $this->hasElements($elems)) {
return;
@@ -882,11 +881,10 @@ private function extractEntityFromQuery($entity, $detectEntity, $xpathExpression
* @param bool $detectTitle Do we have to detect title ?
* @param string $cssClass CSS class to look for
* @param \DOMNode|null $node DOMNode to look into
- * @param string $logMessage
*
* @return bool Telling if we have to detect title again or not
*/
- private function extractTitle($detectTitle, $cssClass, \DOMNode $node = null, $logMessage)
+ private function extractTitle(bool $detectTitle, string $cssClass, \DOMNode $node = null, string $logMessage): bool
{
if (null === $node) {
return true;
@@ -907,11 +905,10 @@ private function extractTitle($detectTitle, $cssClass, \DOMNode $node = null, $l
* @param bool $detectDate Do we have to detect date ?
* @param string $cssClass CSS class to look for
* @param \DOMNode|null $node DOMNode to look into
- * @param string $logMessage
*
* @return bool Telling if we have to detect date again or not
*/
- private function extractDate($detectDate, $cssClass, \DOMNode $node = null, $logMessage)
+ private function extractDate(bool $detectDate, string $cssClass, \DOMNode $node = null, string $logMessage): bool
{
if (null === $node) {
return true;
@@ -934,7 +931,7 @@ private function extractDate($detectDate, $cssClass, \DOMNode $node = null, $log
*
* @return bool Telling if we have to detect author again or not
*/
- private function extractAuthor($detectAuthor, \DOMNode $node = null)
+ private function extractAuthor(bool $detectAuthor, \DOMNode $node = null): bool
{
if (false === $detectAuthor) {
return false;
@@ -981,7 +978,7 @@ private function extractAuthor($detectAuthor, \DOMNode $node = null)
*
* @return bool Telling if we have to detect body again or not
*/
- private function extractBody($detectBody, $xpathExpression, \DOMNode $node = null, $type)
+ private function extractBody(bool $detectBody, string $xpathExpression, \DOMNode $node = null, string $type): bool
{
if (false === $detectBody) {
return false;
@@ -1066,10 +1063,8 @@ private function extractBody($detectBody, $xpathExpression, \DOMNode $node = nul
* @param string $url URL of the content
* @param string $parser Parser to use
* @param bool $enableTidy Should it use tidy extension?
- *
- * @return Readability
*/
- private function getReadability($html, $url, $parser, $enableTidy)
+ private function getReadability(string $html, string $url, string $parser, bool $enableTidy): Readability
{
$readability = new Readability($html, $url, $parser, $enableTidy);
@@ -1104,7 +1099,7 @@ private function getReadability($html, $url, $parser, $enableTidy)
*
* @return bool Telling if the entity has been found
*/
- private function extractEntityFromPattern($entity, $pattern, $returnCallback = null)
+ private function extractEntityFromPattern(string $entity, string $pattern, $returnCallback = null): bool
{
// we define the default callback here
if (!\is_callable($returnCallback)) {
@@ -1159,7 +1154,7 @@ private function extractEntityFromPattern($entity, $pattern, $returnCallback = n
*
* @return bool Telling if the entity has been found
*/
- private function extractMultipleEntityFromPattern($entity, $pattern, $returnCallback = null)
+ private function extractMultipleEntityFromPattern(string $entity, string $pattern, $returnCallback = null): bool
{
// we define the default callback here
if (!\is_callable($returnCallback)) {
@@ -1207,8 +1202,10 @@ private function extractMultipleEntityFromPattern($entity, $pattern, $returnCall
* - JSON-LD.
*
* @param string $html Html from the page
+ *
+ * @return void|null
*/
- private function extractDefinedInformation($html)
+ private function extractDefinedInformation(string $html)
{
if ('' === trim($html)) {
return;
@@ -1236,7 +1233,7 @@ private function extractDefinedInformation($html)
*
* @see http://stackoverflow.com/a/7454737/569101
*/
- private function extractOpenGraph(\DOMXPath $xpath)
+ private function extractOpenGraph(\DOMXPath $xpath): void
{
// retrieve "og:" properties
$metas = $xpath->query('//*/meta[starts-with(@property, \'og:\')]');
@@ -1316,6 +1313,8 @@ private function extractOpenGraph(\DOMXPath $xpath)
/**
* Clean extract of JSON-LD authors.
+ *
+ * @return array|string
*/
private function extractAuthorsFromJsonLdArray(array $authors)
{
@@ -1338,13 +1337,15 @@ private function extractAuthorsFromJsonLdArray(array $authors)
* @param \DOMXPath $xpath DOMXpath from the DOMDocument of the page
*
* @see https://json-ld.org/spec/latest/json-ld/
+ *
+ * @return void|null
*/
private function extractJsonLdInformation(\DOMXPath $xpath)
{
$scripts = $xpath->query('//*/script[@type="application/ld+json"]');
if (false === $scripts) {
- return null;
+ return;
}
$ignoreNames = [];
@@ -1364,18 +1365,12 @@ private function extractJsonLdInformation(\DOMXPath $xpath)
// just in case datePublished isn't defined, we use the modified one at first
if (!empty($data['dateModified'])) {
- $this->date = $data['dateModified'];
+ $this->date = \is_array($data['dateModified']) ? reset($data['dateModified']) : $data['dateModified'];
$this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]);
}
if (!empty($data['datePublished'])) {
- $this->date = $data['datePublished'];
- $this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]);
- }
-
- // sometimes the date is an array
- if (\is_array($this->date)) {
- $this->date = reset($this->date);
+ $this->date = \is_array($data['datePublished']) ? reset($data['datePublished']) : $data['datePublished'];
$this->logger->info('date matched from JsonLd: {date}', ['date' => $this->date]);
}
@@ -1408,12 +1403,8 @@ private function extractJsonLdInformation(\DOMXPath $xpath)
}
if (!empty($data['image']['url'])) {
- $this->image = $data['image']['url'];
-
// some people use ImageObject url field as an array instead of a string...
- if (\is_array($data['image']['url'])) {
- $this->image = current($data['image']['url']);
- }
+ $this->image = \is_array($data['image']['url']) ? current($data['image']['url']) : $data['image']['url'];
}
}
diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php
index ba8d4db7..f48cf177 100644
--- a/src/Extractor/HttpClient.php
+++ b/src/Extractor/HttpClient.php
@@ -109,7 +109,7 @@ public function __construct(Client $client, $config = [], LoggerInterface $logge
);
}
- public function setLogger(LoggerInterface $logger)
+ public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
}
@@ -263,7 +263,12 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
}
// remove utm parameters & fragment
- $effectiveUrl = preg_replace('/((\?)?(&(amp;)?)?utm_(.*?)\=[^&]+)|(#(.*?)\=[^&]+)/', '', rawurldecode($effectiveUrl));
+ $uri = new Uri(str_replace('&amp;', '&', $effectiveUrl));
+ parse_str($uri->getQuery(), $query);
+ $queryParameters = array_filter($query, function ($k) {
+ return !(0 === stripos($k, 'utm_'));
+ }, \ARRAY_FILTER_USE_KEY);
+ $effectiveUrl = (string) Uri::withQueryValues(new Uri($uri->withFragment('')->withQuery('')), $queryParameters);
$this->logger->info('Data fetched: {data}', ['data' => [
'effective_url' => $effectiveUrl,
@@ -282,12 +287,8 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = [])
/**
* Cleanup URL and retrieve the final url to be called.
- *
- * @param string $url
- *
- * @return string
*/
- private function cleanupUrl($url)
+ private function cleanupUrl(string $url): string
{
// rewrite part of urls to something more readable
foreach ($this->config['rewrite_url'] as $find => $action) {
@@ -323,10 +324,8 @@ private function cleanupUrl($url)
* by checking the extension.
*
* @param string $url Absolute url
- *
- * @return bool
*/
- private function possibleUnsupportedType($url)
+ private function possibleUnsupportedType(string $url): bool
{
$ext = strtolower(trim(pathinfo($url, \PATHINFO_EXTENSION)));
@@ -344,10 +343,8 @@ private function possibleUnsupportedType($url)
*
* @param string $url Absolute url
* @param array $httpHeader Custom HTTP Headers from SiteConfig
- *
- * @return string
*/
- private function getUserAgent($url, array $httpHeader = [])
+ private function getUserAgent(string $url, array $httpHeader = []): string
{
$ua = $this->config['ua_browser'];
@@ -392,10 +389,8 @@ private function getUserAgent($url, array $httpHeader = [])
*
* @param string $url Absolute url
* @param array $httpHeader Custom HTTP Headers from SiteConfig
- *
- * @return string
*/
- private function getReferer($url, $httpHeader = [])
+ private function getReferer(string $url, array $httpHeader = []): string
{
$default_referer = $this->config['default_referer'];
@@ -418,10 +413,8 @@ private function getReferer($url, $httpHeader = [])
*
* @param string $url Absolute url
* @param array $httpHeader Custom HTTP Headers from SiteConfig
- *
- * @return string|null
*/
- private function getCookie($url, $httpHeader = [])
+ private function getCookie(string $url, array $httpHeader = []): ?string
{
if (!empty($httpHeader['cookie'])) {
$this->logger->info('Found cookie "{cookie}" for url "{url}" from site config', ['cookie' => $httpHeader['cookie'], 'url' => $url]);
@@ -463,7 +456,7 @@ private function getCookie($url, $httpHeader = [])
*
* @return string|false
*/
- private function getAccept($url, $httpHeader = [])
+ private function getAccept(string $url, array $httpHeader = [])
{
if (!empty($httpHeader['accept'])) {
$this->logger->info('Found accept header "{accept}" for url "{url}" from site config', ['accept' => $httpHeader['accept'], 'url' => $url]);
@@ -481,10 +474,8 @@ private function getAccept($url, $httpHeader = [])
* Since the request is now done we directly check the Content-Type header
*
* @param array $headers All headers from the request
- *
- * @return bool
*/
- private function headerOnlyType(array $headers)
+ private function headerOnlyType(array $headers): bool
{
$contentType = isset($headers['content-type']) ? $headers['content-type'] : '';
@@ -512,7 +503,7 @@ private function headerOnlyType(array $headers)
*
* @return false|string
*/
- private function getMetaRefreshURL($url, $html)
+ private function getMetaRefreshURL(string $url, string $html)
{
if ('' === $html) {
return false;
@@ -548,7 +539,7 @@ private function getMetaRefreshURL($url, $html)
*
* @return false|string
*/
- private function getUglyURL($url, $html)
+ private function getUglyURL(string $url, string $html)
{
$found = false;
foreach ($this->config['ajax_triggers'] as $string) {
@@ -577,10 +568,8 @@ private function getUglyURL($url, $html)
/**
* Format all headers to avoid unecessary array level.
* Also lower the header name.
- *
- * @return array
*/
- private function formatHeaders(ResponseInterface $response)
+ private function formatHeaders(ResponseInterface $response): array
{
$headers = [];
foreach ($response->getHeaders() as $name => $value) {
diff --git a/src/Graby.php b/src/Graby.php
index af5053a5..86d11506 100644
--- a/src/Graby.php
+++ b/src/Graby.php
@@ -27,21 +27,24 @@
*/
class Graby
{
- private $debug = false;
+ private bool $debug = false;
/** @var LoggerInterface */
private $logger;
- private $logLevel = 'info';
+ private string $logLevel = 'info';
- private $config = [];
+ private array $config = [];
+ /** @var HttpClient|null */
private $httpClient = null;
+ /** @var ContentExtractor|null */
private $extractor = null;
/** @var ConfigBuilder */
private $configBuilder;
+ /** @var Punycode */
private $punycode;
- private $imgNoReferrer = false;
+ private bool $imgNoReferrer = false;
/**
* @param array $config
@@ -128,7 +131,7 @@ public function __construct($config = [], Client $client = null, ConfigBuilder $
/**
* Redefine all loggers.
*/
- public function setLogger(LoggerInterface $logger)
+ public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
$this->extractor->setLogger($logger);
@@ -140,7 +143,7 @@ public function setLogger(LoggerInterface $logger)
*
* @see ConfigBuilder::loadConfigFiles
*/
- public function reloadConfigFiles()
+ public function reloadConfigFiles(): void
{
$this->configBuilder->loadConfigFiles();
}
@@ -148,11 +151,9 @@ public function reloadConfigFiles()
/**
* Return a config.
*
- * @param string $key
- *
* @return mixed
*/
- public function getConfig($key)
+ public function getConfig(string $key)
{
if (!isset($this->config[$key])) {
throw new \Exception(sprintf('No config found for key: "%s"', $key));
@@ -164,11 +165,9 @@ public function getConfig($key)
/**
* Fetch content from the given url and return a readable content.
*
- * @param string $url
- *
* @return array With keys html, title, url & summary
*/
- public function fetchContent($url)
+ public function fetchContent(string $url): array
{
$this->logger->info('Graby is ready to fetch');
@@ -180,27 +179,22 @@ public function fetchContent($url)
return $infos;
}
- public function toggleImgNoReferrer($toggle)
+ public function toggleImgNoReferrer(bool $toggle = false): void
{
- if (\is_bool($toggle)) {
- $this->imgNoReferrer = $toggle;
- }
+ $this->imgNoReferrer = $toggle;
}
/**
* Cleanup HTML from a DOMElement or a string.
*
- * @param string|\DOMElement $contentBlock
- * @param string $url
- *
- * @return string
+ * @param string|\DOMElement|\DOMNode $contentBlock
*/
- public function cleanupHtml($contentBlock, $url)
+ public function cleanupHtml($contentBlock, string $url): string
{
- $originalContentBlock = $contentBlock instanceof \DOMElement ? $contentBlock->textContent : $contentBlock;
+ $originalContentBlock = \is_string($contentBlock) ? $contentBlock : $contentBlock->textContent;
// if content is pure html, convert it
- if (!$contentBlock instanceof \DOMElement) {
+ if (\is_string($contentBlock)) {
$this->extractor->process($contentBlock, $url);
$contentBlock = $this->extractor->getContent();
@@ -275,11 +269,9 @@ public function cleanupHtml($contentBlock, $url)
/**
* Do fetch content from an url.
*
- * @param string $url
- *
* @return array With key status, html, title, language, date, authors, url, image, headers & native_ad
*/
- private function doFetchContent($url)
+ private function doFetchContent(string $url): array
{
$url = $this->validateUrl($url);
$siteConfig = $this->configBuilder->buildFromUrl($url);
@@ -466,12 +458,8 @@ private function doFetchContent($url)
/**
* Validate & clean the given url.
- *
- * @param string $url
- *
- * @return string
*/
- private function validateUrl($url)
+ private function validateUrl(string $url): string
{
// Check for feed URL
$url = trim($url);
@@ -518,7 +506,7 @@ private function validateUrl($url)
return $url;
}
- private function isUrlAllowed($url)
+ private function isUrlAllowed(string $url): bool
{
$allowedUrls = $this->getConfig('allowed_urls');
$blockedUrls = $this->getConfig('blocked_urls');
@@ -543,12 +531,10 @@ private function isUrlAllowed($url)
/**
* Based on content-type http header, decide what to do.
*
- * @param array $headers All headers from the response
- *
* @return array With keys: 'mime', 'type', 'subtype', 'action', 'name'
* e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
*/
- private function getMimeActionInfo(array $headers)
+ private function getMimeActionInfo(array $headers): array
{
$contentType = isset($headers['content-type']) ? strtolower($headers['content-type']) : '';
@@ -586,10 +572,8 @@ private function getMimeActionInfo(array $headers)
* @param array $mimeInfo From getMimeActionInfo() function
* @param string $effectiveUrl Current content url
* @param array $response A response
- *
- * @return array|null
*/
- private function handleMimeAction($mimeInfo, $effectiveUrl, $response = [])
+ private function handleMimeAction(array $mimeInfo, string $effectiveUrl, array $response = []): ?array
{
if (!isset($mimeInfo['action']) || !\in_array($mimeInfo['action'], ['link', 'exclude'], true)) {
return null;
@@ -678,12 +662,9 @@ private function handleMimeAction($mimeInfo, $effectiveUrl, $response = [])
/**
* returns single page response, or false if not found.
*
- * @param string $html
- * @param string $url
- *
* @return false|array From httpClient fetch
*/
- private function getSinglePage($html, $url)
+ private function getSinglePage(string $html, string $url)
{
$this->logger->info('Looking for site config files to see if single page link exists');
$siteConfig = $this->configBuilder->buildFromUrl($url);
@@ -777,7 +758,7 @@ private function getSinglePage($html, $url)
* @param string $base The base url
* @param \DOMElement $elem Element on which we'll retrieve the attribute
*/
- private function makeAbsolute($base, \DOMElement $elem)
+ private function makeAbsolute(string $base, \DOMElement $elem): void
{
foreach (['a' => 'href', 'img' => 'src', 'iframe' => 'src'] as $tag => $attr) {
$elems = $elem->getElementsByTagName($tag);
@@ -802,7 +783,7 @@ private function makeAbsolute($base, \DOMElement $elem)
* @param \DOMNode $e Element on which we'll retrieve the attribute
* @param string $attr Attribute that contains the url to absolutize
*/
- private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
+ private function makeAbsoluteAttr(string $base, \DOMNode $e, $attr): void
{
if (!$e->attributes->getNamedItem($attr) || !$e instanceof \DOMElement) {
return;
@@ -832,7 +813,7 @@ private function makeAbsoluteAttr($base, \DOMNode $e, $attr)
*
* @return false|string
*/
- private function makeAbsoluteStr($base, $url)
+ private function makeAbsoluteStr(string $base, string $url)
{
if (!$url) {
return false;
@@ -858,14 +839,8 @@ private function makeAbsoluteStr($base, $url)
* Truncate text.
*
* @see https://github.com/twigphp/Twig-extensions/blob/449e3c8a9ffad7c2479c7864557275a32b037499/lib/Twig/Extensions/Extension/Text.php#L40
- *
- * @param string $text
- * @param int $length
- * @param string $separator
- *
- * @return string
*/
- private function getExcerpt($text, $length = 250, $separator = ' …')
+ private function getExcerpt(string $text, int $length = 250, ?string $separator = ' …'): string
{
// use regex instead of strip_tags to left some spaces when removing tags
$text = preg_replace('#<[^>]+>#', ' ', (string) $text);
@@ -894,13 +869,8 @@ private function getExcerpt($text, $length = 250, $separator = ' …')
* (uses HTTP headers and HTML to find encoding).
*
* Adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
- *
- * @param string $html
- * @param array $headers All headers from the response
- *
- * @return string
*/
- private function convert2Utf8($html, array $headers = [])
+ private function convert2Utf8(string $html, array $headers = []): string
{
$contentType = isset($headers['content-type']) ? strtolower($headers['content-type']) : '';
@@ -912,7 +882,7 @@ private function convert2Utf8($html, array $headers = [])
// remove strange things
$html = str_replace('[>', '', $html);
- if (empty($contentType) || !preg_match_all('/([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $contentType, $match, \PREG_SET_ORDER)) {
+ if (!preg_match_all('/([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $contentType, $match, \PREG_SET_ORDER)) {
// error parsing the response
$this->logger->info('Could not find Content-Type header in HTTP response', ['headers' => $headers]);
} else {
@@ -1002,12 +972,8 @@ private function convert2Utf8($html, array $headers = [])
/**
* Try to cleanup XSS using htmLawed.
- *
- * @param string $html
- *
- * @return string
*/
- private function cleanupXss($html)
+ private function cleanupXss(string $html): string
{
if (false === $this->config['xss_filter']) {
return $html;
diff --git a/src/HttpClient/Plugin/CookiePlugin.php b/src/HttpClient/Plugin/CookiePlugin.php
index 7da4be90..1ba7af4e 100644
--- a/src/HttpClient/Plugin/CookiePlugin.php
+++ b/src/HttpClient/Plugin/CookiePlugin.php
@@ -95,7 +95,7 @@ private function createCookie(RequestInterface $request, string $setCookieHeader
{
$parts = array_map('trim', explode(';', $setCookieHeader));
- if (empty($parts) || !strpos($parts[0], '=')) {
+ if ('' === $parts[0] || false === strpos($parts[0], '=')) {
return null;
}
diff --git a/src/HttpClient/Plugin/History.php b/src/HttpClient/Plugin/History.php
index 746e097b..a3e244cc 100644
--- a/src/HttpClient/Plugin/History.php
+++ b/src/HttpClient/Plugin/History.php
@@ -18,29 +18,23 @@ class History implements Journal
*/
private $lastResponse;
- /**
- * @return RequestInterface|null
- */
- public function getLastRequest()
+ public function getLastRequest(): ?RequestInterface
{
return $this->lastRequest;
}
- /**
- * @return ResponseInterface|null
- */
- public function getLastResponse()
+ public function getLastResponse(): ?ResponseInterface
{
return $this->lastResponse;
}
- public function addSuccess(RequestInterface $request, ResponseInterface $response)
+ public function addSuccess(RequestInterface $request, ResponseInterface $response): void
{
$this->lastRequest = $request;
$this->lastResponse = $response;
}
- public function addFailure(RequestInterface $request, ClientExceptionInterface $exception)
+ public function addFailure(RequestInterface $request, ClientExceptionInterface $exception): void
{
}
}
diff --git a/src/Monolog/Handler/GrabyHandler.php b/src/Monolog/Handler/GrabyHandler.php
index 0d6b3019..a5d8494d 100644
--- a/src/Monolog/Handler/GrabyHandler.php
+++ b/src/Monolog/Handler/GrabyHandler.php
@@ -13,8 +13,8 @@
*/
class GrabyHandler extends AbstractProcessingHandler
{
- protected $records = [];
- protected $recordsByLevel = [];
+ protected array $records = [];
+ protected array $recordsByLevel = [];
public function __construct($level = Logger::DEBUG, $bubble = true)
{
@@ -24,18 +24,21 @@ public function __construct($level = Logger::DEBUG, $bubble = true)
$this->pushProcessor(new PsrLogMessageProcessor());
}
- public function getRecords()
+ public function getRecords(): array
{
return $this->records;
}
- public function clear()
+ public function clear(): void
{
$this->records = [];
$this->recordsByLevel = [];
}
- public function hasRecords($level)
+ /**
+ * @param string|int $level Logging level value or name
+ */
+ public function hasRecords($level): bool
{
return isset($this->recordsByLevel[$level]);
}
diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php
index e7c8af4e..73e37311 100644
--- a/src/SiteConfig/ConfigBuilder.php
+++ b/src/SiteConfig/ConfigBuilder.php
@@ -11,12 +11,12 @@ class ConfigBuilder
{
/** @var LoggerInterface */
private $logger;
- private $config = [];
- private $configFiles = [];
- private $cache = [];
+ private array $config = [];
+ private array $configFiles = [];
+ private array $cache = [];
// Array for accepted headers for http_header()
- private $acceptedHeaders = [
+ private array $acceptedHeaders = [
'user-agent',
'referer',
'cookie',
@@ -24,7 +24,7 @@ class ConfigBuilder
];
// Array of accepted HTML tags for wrap_in()
- private $acceptedWrapInTags = [
+ private array $acceptedWrapInTags = [
'blockquote',
'p',
'div',
@@ -52,7 +52,7 @@ public function __construct($config = [], LoggerInterface $logger = null)
$this->loadConfigFiles();
}
- public function setLogger(LoggerInterface $logger)
+ public function setLogger(LoggerInterface $logger): void
{
$this->logger = $logger;
}
@@ -64,7 +64,7 @@ public function setLogger(LoggerInterface $logger)
* - If we add a new file after, it won't be loaded.
* - We'll need to manually reload config files.
*/
- public function loadConfigFiles()
+ public function loadConfigFiles(): void
{
$this->configFiles = Files::getFiles($this->config['site_config']);
}
@@ -75,7 +75,7 @@ public function loadConfigFiles()
* @param string $key Key for the cache
* @param SiteConfig $config Config to be cached
*/
- public function addToCache($key, SiteConfig $config)
+ public function addToCache($key, SiteConfig $config): void
{
$key = strtolower($key);
if ('www.' === substr($key, 0, 4)) {
@@ -419,20 +419,21 @@ public function parseLines(array $lines)
* @param SiteConfig $config Current config
* @param string $condition XPath condition
*/
- private function handleIfPageContainsCondition(SiteConfig $config, $condition)
+ private function handleIfPageContainsCondition(SiteConfig $config, string $condition): void
{
+ $rule = false;
if (!empty($config->single_page_link)) {
$rule = 'single_page_link';
} elseif (!empty($config->next_page_link)) {
$rule = 'next_page_link';
- } else {
- // no link found, we can't apply "if_page_contains"
- return;
}
- $key = end($config->$rule);
- reset($config->$rule);
+ // no link found, we can't apply "if_page_contains"
+ if ($rule) {
+ $key = end($config->$rule);
+ reset($config->$rule);
- $config->if_page_contains[$rule][$key] = (string) $condition;
+ $config->if_page_contains[$rule][$key] = (string) $condition;
+ }
}
}
diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php
index fd4d8906..84f10b50 100644
--- a/tests/Extractor/ContentExtractorTest.php
+++ b/tests/Extractor/ContentExtractorTest.php
@@ -163,10 +163,10 @@ public function testProcessFindString(): void
$this->assertTrue($res, 'Extraction went well');
- $content_block = $contentExtractor->getContent();
+ $contentBlock = $contentExtractor->getContent();
- $this->assertStringContainsString('