diff --git a/.github/workflows/coding-standards.yml b/.github/workflows/coding-standards.yml index 5d8714c4..589bb9cc 100644 --- a/.github/workflows/coding-standards.yml +++ b/.github/workflows/coding-standards.yml @@ -3,10 +3,12 @@ name: "CS" on: pull_request: branches: - - master + - "master" + - "2.x" push: branches: - - master + - "master" + - "2.x" jobs: coding-standards: @@ -16,11 +18,11 @@ jobs: strategy: matrix: php: - - "7.3" + - "7.4" steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" - name: "Install PHP" uses: "shivammathur/setup-php@v2" @@ -33,7 +35,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Run PHP CS Fixer" run: "php vendor/bin/php-cs-fixer fix --verbose --dry-run --format=checkstyle | cs2pr" diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml index a59ecd7c..7d08d8d3 100644 --- a/.github/workflows/continuous-integration.yml +++ b/.github/workflows/continuous-integration.yml @@ -4,9 +4,11 @@ on: pull_request: branches: - "master" + - "2.x" push: branches: - "master" + - "2.x" env: fail-fast: true @@ -24,10 +26,13 @@ jobs: - "7.3" - "7.4" - "8.0" + - "8.1" + - "8.2" + - "8.3" steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -43,7 +48,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup logs" run: "mkdir -p build/logs" @@ -62,7 +67,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -78,7 +83,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup logs" run: "mkdir -p build/logs" @@ -108,7 +113,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -124,7 +129,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" with: dependency-versions: "lowest" @@ -145,7 +150,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -161,7 +166,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup adapter: Guzzle 5" run: | @@ -185,7 +190,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -201,7 +206,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup adapter: Guzzle 7" run: | @@ -225,7 +230,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -241,13 +246,12 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup adapter: cURL" run: | composer remove php-http/guzzle6-adapter --dev -n composer require php-http/curl-client --dev -n - composer require zendframework/zend-diactoros --dev -n - name: "Setup logs" run: "mkdir -p build/logs" @@ -266,7 +270,7 @@ jobs: steps: - name: "Checkout" - uses: "actions/checkout@v2" + uses: "actions/checkout@v4" with: fetch-depth: 2 @@ -282,7 +286,7 @@ jobs: COMPOSER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: "Install dependencies with Composer" - uses: "ramsey/composer-install@v1" + uses: "ramsey/composer-install@v2" - name: "Setup logs" run: "mkdir -p build/logs" diff --git a/composer.json b/composer.json index bfa5f0ad..7c61e743 100644 --- a/composer.json +++ b/composer.json @@ -19,18 +19,18 @@ "ext-curl": "*", "ext-tidy": "*", "fossar/htmlawed": "^1.2.7", - "http-interop/http-factory-guzzle": "^1.0", - "j0k3r/graby-site-config": "^1.0.110", + "http-interop/http-factory-guzzle": "^1.1", + "j0k3r/graby-site-config": "^1.0.181", "j0k3r/httplug-ssrf-plugin": "^2.0", - "j0k3r/php-readability": "^1.2.3", + "j0k3r/php-readability": "^1.2.10", "monolog/monolog": "^1.18.0|^2.0", - "php-http/client-common": "^2.3", - "php-http/discovery": "^1.12", - "php-http/httplug": "^2.2", - "php-http/message": "^1.9", - "simplepie/simplepie": "^1.5", - "smalot/pdfparser": "^1.0", - "symfony/options-resolver": "^3.4|^4.4|^5.3|^6.0", + "php-http/client-common": "^2.7", + "php-http/discovery": "^1.19", + "php-http/httplug": "^2.4", + "php-http/message": "^1.14", + "simplepie/simplepie": "^1.7", + "smalot/pdfparser": "^1.1", + "symfony/options-resolver": "^3.4|^4.4|^5.3|^6.0|^7.0", "true/punycode": "^2.1", "guzzlehttp/psr7": "^1.5.0|^2.0" }, @@ -43,7 +43,7 @@ "phpstan/phpstan": "^0.12", "phpstan/phpstan-deprecation-rules": "^0.12", "phpstan/phpstan-phpunit": "^0.12", - "symfony/phpunit-bridge": "^5.3" + "symfony/phpunit-bridge": "^6.4.1" }, "extra": { "branch-alias": { @@ -61,6 +61,10 @@ } }, "config": { - "sort-packages": true + "sort-packages": true, + "allow-plugins": { + "php-http/discovery": true, + "phpstan/extension-installer": true + } } } diff --git a/phpunit.xml b/phpunit.xml index f3063b5a..7b33c79f 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -38,6 +38,10 @@ + + + + diff --git a/src/Extractor/ContentExtractor.php b/src/Extractor/ContentExtractor.php index 5f09582e..7875d38f 100644 --- a/src/Extractor/ContentExtractor.php +++ b/src/Extractor/ContentExtractor.php @@ -21,14 +21,14 @@ class ContentExtractor private $xpath; private $html; private $config; - private $siteConfig = null; - private $title = null; - private $language = null; + private $siteConfig; + private $title; + private $language; private $authors = []; - private $body = null; - private $image = null; + private $body; + private $image; private $nativeAd = false; - private $date = null; + private $date; private $success = false; private $nextPageUrl; /** @var LoggerInterface */ diff --git a/src/Extractor/HttpClient.php b/src/Extractor/HttpClient.php index e1f774a6..3f509d91 100644 --- a/src/Extractor/HttpClient.php +++ b/src/Extractor/HttpClient.php @@ -13,8 +13,8 @@ use Http\Client\Common\Plugin\RedirectPlugin; use Http\Client\Common\PluginClient; use Http\Client\Exception\TransferException; -use Http\Client\HttpClient as Client; use Http\Discovery\Psr17FactoryDiscovery; +use Psr\Http\Client\ClientInterface; use Psr\Http\Message\ResponseInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; @@ -42,11 +42,7 @@ class HttpClient */ private $responseHistory; - /** - * @param Client $client Http client - * @param array $config - */ - public function __construct(Client $client, $config = [], LoggerInterface $logger = null) + public function __construct(ClientInterface $client, $config = [], LoggerInterface $logger = null) { $resolver = new OptionsResolver(); $resolver->setDefaults([ @@ -254,7 +250,7 @@ public function fetch($url, $skipTypeVerification = false, $httpHeader = []) // check for // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (false === strpos($effectiveUrl, '_escaped_fragment_')) { + if (!str_contains($effectiveUrl, '_escaped_fragment_')) { $redirectURL = $this->getMetaRefreshURL($effectiveUrl, $body) ?: $this->getUglyURL($effectiveUrl, $body); if (false !== $redirectURL) { @@ -296,7 +292,7 @@ private function cleanupUrl($url) { // rewrite part of urls to something more readable foreach ($this->config['rewrite_url'] as $find => $action) { - if (false !== strpos($url, $find) && \is_array($action)) { + if (str_contains($url, $find) && \is_array($action)) { $url = strtr($url, $action); } } diff --git a/src/Graby.php b/src/Graby.php index 4b0e9200..13bd64f2 100644 --- a/src/Graby.php +++ b/src/Graby.php @@ -9,11 +9,11 @@ use GuzzleHttp\Psr7\Uri; use GuzzleHttp\Psr7\UriResolver; use Http\Client\Common\PluginClient; -use Http\Client\HttpClient as Client; -use Http\Discovery\HttpClientDiscovery; +use Http\Discovery\Psr18ClientDiscovery; use Http\Message\CookieJar; use Monolog\Handler\StreamHandler; use Monolog\Logger; +use Psr\Http\Client\ClientInterface; use Psr\Log\LoggerInterface; use Psr\Log\NullLogger; use Readability\Readability; @@ -34,21 +34,17 @@ class Graby private $config = []; - private $httpClient = null; - private $extractor = null; + private $httpClient; + private $extractor; /** @var ConfigBuilder */ private $configBuilder; private $punycode; private $imgNoReferrer = false; - private $prefetchedContent = null; + private $prefetchedContent; - /** - * @param array $config - * @param Client|null $client Http client - */ - public function __construct($config = [], Client $client = null, ConfigBuilder $configBuilder = null) + public function __construct($config = [], ClientInterface $client = null, ConfigBuilder $configBuilder = null) { $resolver = new OptionsResolver(); $resolver->setDefaults([ @@ -118,7 +114,7 @@ public function __construct($config = [], Client $client = null, ConfigBuilder $ ); $this->httpClient = new HttpClient( - $client ?: new PluginClient(HttpClientDiscovery::find(), [new CookiePlugin(new CookieJar())]), + $client ?: new PluginClient(Psr18ClientDiscovery::find(), [new CookiePlugin(new CookieJar())]), $this->config['http_client'], $this->logger ); @@ -150,8 +146,6 @@ public function reloadConfigFiles() * Return a config. * * @param string $key - * - * @return mixed */ public function getConfig($key) { @@ -226,7 +220,7 @@ public function cleanupHtml($contentBlock, $url) } // footnotes - if ('footnotes' === $this->config['content_links'] && false === strpos($url, 'wikipedia.org')) { + if ('footnotes' === $this->config['content_links'] && !str_contains($url, 'wikipedia.org')) { $this->extractor->readability->addFootnotes($contentBlock); } @@ -343,13 +337,6 @@ private function doFetchContent($url) $this->logger->debug('HTML after regex empty nodes stripping', ['html' => $html]); - // some non utf8 enconding might be breaking after converting to utf8 - // when it happen the string (usually) starts with this character - // in that case, we'll take the default response instead of the utf8 forced one - if (0 === strpos(utf8_encode($response['body']), 'ÿþ')) { - $html = $response['body']; - } - // check site config for single page URL - fetch it if found $isSinglePage = false; if ($this->config['singlepage'] && null === $this->prefetchedContent && ($singlePageResponse = $this->getSinglePage($html, $effectiveUrl))) { diff --git a/src/HttpClient/Plugin/CookiePlugin.php b/src/HttpClient/Plugin/CookiePlugin.php index 7da4be90..a92aa9e7 100644 --- a/src/HttpClient/Plugin/CookiePlugin.php +++ b/src/HttpClient/Plugin/CookiePlugin.php @@ -31,9 +31,6 @@ public function __construct(CookieJar $cookieJar) $this->cookieJar = $cookieJar; } - /** - * {@inheritdoc} - */ public function handleRequest(RequestInterface $request, callable $next, callable $first): Promise { $cookies = []; diff --git a/src/SiteConfig/ConfigBuilder.php b/src/SiteConfig/ConfigBuilder.php index 577103f4..ae527730 100644 --- a/src/SiteConfig/ConfigBuilder.php +++ b/src/SiteConfig/ConfigBuilder.php @@ -206,6 +206,7 @@ public function buildForHost($host, $addToCache = true) * @return false|SiteConfig * * @deprecated Use either buildForHost() / buildFromUrl() for the merged config or loadSiteConfig() to get the config for a site + * * @codeCoverageIgnore */ public function build($host, $exactHostMatch = false) diff --git a/src/SiteConfig/SiteConfig.php b/src/SiteConfig/SiteConfig.php index 65f63c23..4ec84dfe 100644 --- a/src/SiteConfig/SiteConfig.php +++ b/src/SiteConfig/SiteConfig.php @@ -54,7 +54,7 @@ class SiteConfig * * @var ?string */ - public $src_lazy_load_attr = null; + public $src_lazy_load_attr; /** * Strip elements which contain these strings (0 or more) in the id or class attribute. @@ -89,7 +89,7 @@ class SiteConfig * * @var ?bool */ - public $tidy = null; + public $tidy; /** * Autodetect title/body if xpath expressions fail to produce results. @@ -103,14 +103,14 @@ class SiteConfig * * @var ?bool */ - public $autodetect_on_failure = null; + public $autodetect_on_failure; /** * Clean up content block - attempt to remove elements that appear to be superfluous. * * @var ?bool */ - public $prune = null; + public $prune; /** * Test URL - if present, can be used to test the config above. @@ -149,7 +149,7 @@ class SiteConfig * * @var ?string */ - public $parser = null; + public $parser; /** * Strings to search for in HTML before processing begins (used with $replace_string). @@ -170,7 +170,7 @@ class SiteConfig * * @var ?string */ - public $cache_key = null; + public $cache_key; /** * If fetching the site's content requires to authentify. diff --git a/tests/Extractor/ContentExtractorTest.php b/tests/Extractor/ContentExtractorTest.php index fd4d8906..e64c885e 100644 --- a/tests/Extractor/ContentExtractorTest.php +++ b/tests/Extractor/ContentExtractorTest.php @@ -847,7 +847,7 @@ public function testLogMessage(): void $this->assertSame('Trying {pattern} for language', $records[4]['message']); $this->assertSame('Trying {pattern} for language', $records[5]['message']); $this->assertSame('Using Readability', $records[6]['message']); - $this->assertSame('Date is bad (strtotime failed): {date}', $records[7]['message']); + $this->assertSame('Date is bad (wrong year): {date}', $records[7]['message']); $this->assertSame('Attempting to parse HTML with {parser}', $records[9]['message']); } diff --git a/tests/GrabyFunctionalTest.php b/tests/GrabyFunctionalTest.php index 74357cef..b49e2edb 100644 --- a/tests/GrabyFunctionalTest.php +++ b/tests/GrabyFunctionalTest.php @@ -163,7 +163,6 @@ public function dataWithAccent(): array return [ // ['http://pérotin.com/post/2015/08/31/Le-cadran-solaire-amoureux'], ['https://en.wikipedia.org/wiki/Café'], - ['http://www.atterres.org/article/budget-2016-les-10-méprises-libérales-du-gouvernement'], ]; } diff --git a/tests/GrabyTest.php b/tests/GrabyTest.php index 1e3c6dfd..e8acbc94 100644 --- a/tests/GrabyTest.php +++ b/tests/GrabyTest.php @@ -413,6 +413,7 @@ public function dataForSinglePage(): array /** * @group dns-sensitive + * * @dataProvider dataForSinglePage */ public function testSinglePage(string $url, string $expectedUrl, string $singlePageUrl): void diff --git a/tests/SiteConfig/ConfigBuilderTest.php b/tests/SiteConfig/ConfigBuilderTest.php index e34e9ef2..10c43557 100644 --- a/tests/SiteConfig/ConfigBuilderTest.php +++ b/tests/SiteConfig/ConfigBuilderTest.php @@ -181,7 +181,7 @@ public function dataForBuild(): array /** * @dataProvider dataForBuild */ - public function testBuildSiteConfig(string $host, bool $expectedRes, ?string $matchedHost = null): void + public function testBuildSiteConfig(string $host, bool $expectedRes, string $matchedHost = null): void { $configBuilder = new ConfigBuilder([ 'site_config' => [__DIR__ . '/../fixtures/site_config'], diff --git a/tests/fixtures/sites/blogger.test b/tests/fixtures/sites/blogger.test index 141c0ff8..0c938931 100644 --- a/tests/fixtures/sites/blogger.test +++ b/tests/fixtures/sites/blogger.test @@ -3285,23 +3285,14 @@ flickering/performance issues. Note: put this last, else text glitches. -->
-----PARSED_CONTENT----- For the past few months I've been working on a "next-gen" MongoDB driver for PHP -- codename "phongo".
-The aim was to build a new PHP extension ontop of the mongoc and libbson libraries to reduce maintenance of the extension itself and focus more on providing the ecosystem with improved support and libraries. -

The new driver is available on PECL (called "mongodb", surprisingly enough). It doesn't include any of the bells and whistles found in the previous "mongo" driver. It doesn't include any `group` or `count` command helpers, and you won't find any Collection or Database objects; however, it really doesn't need any of these things.

All it has is simplicity. You can execute a command of your choosing. You can execute a query. You can execute a write. That's really it (as with everything, there are certain exceptions). Oh, it is also very, very fast.

-

What's the Point?

-Most developers interact with databases these days through their preferred framework abstraction layer. That layer often has its own way of providing the bells and whistles irrespective of the driver's functionality -- it seldom matters what helper methods the driver offers, since these libraries can call MongoDB::command() directly. Besides, upgrading the driver just to get a new command helper, which is easily implemented in userland PHP, is just weird. -

For our new "mongodb" extension, the driver only implements the most essential features for interacting with a MongoDB server:

-
  1. Server Discovery and Monitoring
  2. +The aim was to build a new PHP extension ontop of the mongoc and libbson libraries to reduce maintenance of the extension itself and focus more on providing the ecosystem with improved support and libraries.

    The new driver is available on PECL (called "mongodb", surprisingly enough). It doesn't include any of the bells and whistles found in the previous "mongo" driver. It doesn't include any `group` or `count` command helpers, and you won't find any Collection or Database objects; however, it really doesn't need any of these things.

    All it has is simplicity. You can execute a command of your choosing. You can execute a query. You can execute a write. That's really it (as with everything, there are certain exceptions). Oh, it is also very, very fast.

    What's the Point?

    +Most developers interact with databases these days through their preferred framework abstraction layer. That layer often has its own way of providing the bells and whistles irrespective of the driver's functionality -- it seldom matters what helper methods the driver offers, since these libraries can call MongoDB::command() directly. Besides, upgrading the driver just to get a new command helper, which is easily implemented in userland PHP, is just weird.

    For our new "mongodb" extension, the driver only implements the most essential features for interacting with a MongoDB server: +

    1. Server Discovery and Monitoring
    2. Automatic Server Selection (with manual selection possible) using Read Preferences
    3. BSON [en|de]coding (to/from extended JSON, too)
    4. Implementing the MongoDB wire protocol

    Again, What's the Point?

    -Well, now we can provide you with a stable driver that really shouldn't change all that much, which means less extension upgrades. -

    Furthermore, it should be faster. The legacy driver, which dates back five years, had unfortunate design quirks that couldn't be fully resolved without a costly rewrite (e.g. the way MongoGridFS invokes MongoCollection methods internally). Creating a brand new, no-frills[1], and simple to use driver gives us a fresh starting point for the next five years.

    -

    MongoDB PHP Libraries

    -Of course, we aren't planning on leaving our users out to dry. Whether or not your framework of choice offers an amazing MongoDB abstraction layer, we do want to make it easy, simple, and natural for you to develop applications with MongoDB. -

    To that end, we are writing a PHP library on top of this new extension, which will have all of the frills, bells, and whistles you might expect. It implements the "Standard MongoDB Driver CRUD API" (among others) and we'll continue roll in new features into this library as needed. And because this library will be implemented in PHP, we expect to iterate on new features much more quickly than we were able to do with the legacy driver.

    -

    This won't be the only library we will be writing. There are also plans to develop a library to deal with MongoDB administrative tasks (e.g. creating users, reconfiguring wiredTiger nodes, tailing oplogs) and develop tools to introspect MongoDB clusters.

    -

    What do you think?

    +Well, now we can provide you with a stable driver that really shouldn't change all that much, which means less extension upgrades.

    Furthermore, it should be faster. The legacy driver, which dates back five years, had unfortunate design quirks that couldn't be fully resolved without a costly rewrite (e.g. the way MongoGridFS invokes MongoCollection methods internally). Creating a brand new, no-frills[1], and simple to use driver gives us a fresh starting point for the next five years.

    MongoDB PHP Libraries

    +Of course, we aren't planning on leaving our users out to dry. Whether or not your framework of choice offers an amazing MongoDB abstraction layer, we do want to make it easy, simple, and natural for you to develop applications with MongoDB.

    To that end, we are writing a PHP library on top of this new extension, which will have all of the frills, bells, and whistles you might expect. It implements the "Standard MongoDB Driver CRUD API" (among others) and we'll continue roll in new features into this library as needed. And because this library will be implemented in PHP, we expect to iterate on new features much more quickly than we were able to do with the legacy driver.

    This won't be the only library we will be writing. There are also plans to develop a library to deal with MongoDB administrative tasks (e.g. creating users, reconfiguring wiredTiger nodes, tailing oplogs) and develop tools to introspect MongoDB clusters.

    What do you think?

    The biggest question at this point is: "what else?" What is missing? What are the other pain points you've experienced as a MongoDB developer, that the driver or a library can help with?
    -We would love to hear your feedback! Check out the projects on GitHub (driver, library), and let us know what you think. -

    [1] Well.. Almost. There is one frilled feature I'm excited about -- we'll cover that later.

    \ No newline at end of file +We would love to hear your feedback! Check out the projects on GitHub (driver, library), and let us know what you think.

    [1] Well.. Almost. There is one frilled feature I'm excited about -- we'll cover that later.

    \ No newline at end of file