From d01c462ad514c327a82e92f269c139028de0a4f7 Mon Sep 17 00:00:00 2001 From: Dag Date: Mon, 29 Jan 2024 21:51:34 +0100 Subject: [PATCH] fix(FeedExpander): if parse fails, include offending url in exception message (#3938) Also some refactors --- actions/DetectAction.php | 28 +++++++++++--------- actions/FindfeedAction.php | 6 ++--- bridges/BMDSystemhausBlogBridge.php | 11 +++++--- bridges/FirefoxAddonsBridge.php | 40 ++++++++++++++--------------- lib/FeedExpander.php | 12 ++++++--- 5 files changed, 56 insertions(+), 41 deletions(-) diff --git a/actions/DetectAction.php b/actions/DetectAction.php index 99d299bb285..0c61f1b60d1 100644 --- a/actions/DetectAction.php +++ b/actions/DetectAction.php @@ -4,14 +4,14 @@ class DetectAction implements ActionInterface { public function execute(Request $request) { - $targetURL = $request->get('url'); + $url = $request->get('url'); $format = $request->get('format'); - if (!$targetURL) { - throw new \Exception('You must specify a url!'); + if (!$url) { + return new Response(render(__DIR__ . '/../templates/error.html.php', ['message' => 'You must specify a url'])); } if (!$format) { - throw new \Exception('You must specify a format!'); + return new Response(render(__DIR__ . '/../templates/error.html.php', ['message' => 'You must specify a format'])); } $bridgeFactory = new BridgeFactory(); @@ -23,19 +23,23 @@ public function execute(Request $request) $bridge = $bridgeFactory->create($bridgeClassName); - $bridgeParams = $bridge->detectParameters($targetURL); + $bridgeParams = $bridge->detectParameters($url); - if (is_null($bridgeParams)) { + if (!$bridgeParams) { continue; } - $bridgeParams['bridge'] = $bridgeClassName; - $bridgeParams['format'] = $format; - - $url = '?action=display&' . http_build_query($bridgeParams); - return new Response('', 301, ['location' => $url]); + $query = [ + 'action' => 'display', + 'bridge' => $bridgeClassName, + 'format' => $format, + ]; + $query = array_merge($query, $bridgeParams); + return new Response('', 301, ['location' => '?' . http_build_query($query)]); } - throw new \Exception('No bridge found for given URL: ' . $targetURL); + return new Response(render(__DIR__ . '/../templates/error.html.php', [ + 'message' => 'No bridge found for given URL: ' . $url, + ])); } } diff --git a/actions/FindfeedAction.php b/actions/FindfeedAction.php index cd0a0c741b7..94dc6b72b58 100644 --- a/actions/FindfeedAction.php +++ b/actions/FindfeedAction.php @@ -9,10 +9,10 @@ class FindfeedAction implements ActionInterface { public function execute(Request $request) { - $targetURL = $request->get('url'); + $url = $request->get('url'); $format = $request->get('format'); - if (!$targetURL) { + if (!$url) { return new Response('You must specify a url', 400); } if (!$format) { @@ -29,7 +29,7 @@ public function execute(Request $request) $bridge = $bridgeFactory->create($bridgeClassName); - $bridgeParams = $bridge->detectParameters($targetURL); + $bridgeParams = $bridge->detectParameters($url); if ($bridgeParams === null) { continue; diff --git a/bridges/BMDSystemhausBlogBridge.php b/bridges/BMDSystemhausBlogBridge.php index 327f165d6ba..50ed4639e9a 100644 --- a/bridges/BMDSystemhausBlogBridge.php +++ b/bridges/BMDSystemhausBlogBridge.php @@ -102,12 +102,17 @@ public function collectData() //----------------------------------------------------- public function detectParameters($url) { - $parsed_url = parse_url($url); - if ($parsed_url['host'] != 'www.bmd.com') { + try { + $parsedUrl = Url::fromString($url); + } catch (UrlException $e) { return null; } - $path = explode('/', $parsed_url['path']); + if ($parsedUrl->getHost() != 'www.bmd.com') { + return null; + } + + $path = explode('/', $parsedUrl->getPath()); if ($this->getURIbyCountry($path[1]) == '') { return null; diff --git a/bridges/FirefoxAddonsBridge.php b/bridges/FirefoxAddonsBridge.php index c8fb10f89d6..fcf2ca02f11 100644 --- a/bridges/FirefoxAddonsBridge.php +++ b/bridges/FirefoxAddonsBridge.php @@ -19,23 +19,6 @@ class FirefoxAddonsBridge extends BridgeAbstract const CACHE_TIMEOUT = 3600; private $feedName = ''; - private $releaseDateRegex = '/Released ([\w, ]+) - ([\w. ]+)/'; - private $xpiFileRegex = '/([A-Za-z0-9_.-]+)\.xpi$/'; - private $outgoingRegex = '/https:\/\/prod.outgoing\.prod\.webservices\.mozgcp\.net\/v1\/(?:[A-z0-9]+)\//'; - - private $urlRegex = '/addons\.mozilla\.org\/(?:[\w-]+\/)?firefox\/addon\/([\w-]+)/'; - - public function detectParameters($url) - { - $params = []; - - if (preg_match($this->urlRegex, $url, $matches)) { - $params['id'] = $matches[1]; - return $params; - } - - return null; - } public function collectData() { @@ -52,7 +35,8 @@ public function collectData() $item['uri'] = $this->getURI(); $item['author'] = $author; - if (preg_match($this->releaseDateRegex, $li->find('div.AddonVersionCard-fileInfo', 0)->plaintext, $match)) { + $releaseDateRegex = '/Released ([\w, ]+) - ([\w. ]+)/'; + if (preg_match($releaseDateRegex, $li->find('div.AddonVersionCard-fileInfo', 0)->plaintext, $match)) { $item['timestamp'] = $match[1]; $size = $match[2]; } @@ -68,7 +52,8 @@ public function collectData() $releaseNotes = $this->removeLinkRedirects($li->find('div.AddonVersionCard-releaseNotes', 0)); - if (preg_match($this->xpiFileRegex, $downloadlink, $match)) { + $xpiFileRegex = '/([A-Za-z0-9_.-]+)\.xpi$/'; + if (preg_match($xpiFileRegex, $downloadlink, $match)) { $xpiFilename = $match[0]; } @@ -110,10 +95,25 @@ public function getName() */ private function removeLinkRedirects($html) { + $outgoingRegex = '/https:\/\/prod.outgoing\.prod\.webservices\.mozgcp\.net\/v1\/(?:[A-z0-9]+)\//'; foreach ($html->find('a') as $a) { - $a->href = urldecode(preg_replace($this->outgoingRegex, '', $a->href)); + $a->href = urldecode(preg_replace($outgoingRegex, '', $a->href)); } return $html->innertext; } + + public function detectParameters($url) + { + $params = []; + + // Example: https://addons.mozilla.org/en-US/firefox/addon/ublock-origin + $pattern = '/addons\.mozilla\.org\/(?:[\w-]+\/)?firefox\/addon\/([\w-]+)/'; + if (preg_match($pattern, $url, $matches)) { + $params['id'] = $matches[1]; + return $params; + } + + return null; + } } diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index be3a8ca4e1d..abe964e147a 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -23,14 +23,20 @@ public function collectExpandableDatas(string $url, $maxItems = -1) throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10); } // prepare/massage the xml to make it more acceptable - $badStrings = [ + $problematicStrings = [ ' ', '»', '’', ]; - $xmlString = str_replace($badStrings, '', $xmlString); + $xmlString = str_replace($problematicStrings, '', $xmlString); + $feedParser = new FeedParser(); - $this->feed = $feedParser->parseFeed($xmlString); + try { + $this->feed = $feedParser->parseFeed($xmlString); + } catch (\Exception $e) { + throw new \Exception(sprintf('Failed to parse xml from %s: %s', $url, create_sane_exception_message($e))); + } + $items = array_slice($this->feed['items'], 0, $maxItems); // todo: extract parse logic out from FeedParser foreach ($items as $item) {