From 9056106c2d52991931ba7fc1c41494697396d477 Mon Sep 17 00:00:00 2001 From: ORelio Date: Wed, 18 Oct 2023 19:13:33 +0200 Subject: [PATCH] [CNet] Rewrite bridge (#3764) (#3770) Bridge was broken. Full bridge rewrite using Sitemap as source. --- bridges/CNETBridge.php | 160 +++++++++++++++++++------------------- bridges/SitemapBridge.php | 6 +- 2 files changed, 83 insertions(+), 83 deletions(-) diff --git a/bridges/CNETBridge.php b/bridges/CNETBridge.php index 34442abda8f..4a63c84773c 100644 --- a/bridges/CNETBridge.php +++ b/bridges/CNETBridge.php @@ -1,6 +1,6 @@ 'list', 'values' => [ 'All articles' => '', - 'Apple' => 'apple', - 'Google' => 'google', - 'Microsoft' => 'tags-microsoft', - 'Computers' => 'topics-computers', - 'Mobile' => 'topics-mobile', - 'Sci-Tech' => 'topics-sci-tech', - 'Security' => 'topics-security', - 'Internet' => 'topics-internet', - 'Tech Industry' => 'topics-tech-industry' + 'Tech' => 'tech', + 'Money' => 'personal-finance', + 'Home' => 'home', + 'Wellness' => 'health', + 'Energy' => 'home/energy-and-utilities', + 'Deals' => 'deals', + 'Computing' => 'tech/computing', + 'Mobile' => 'tech/mobile', + 'Science' => 'science', + 'Services' => 'tech/services-and-software' ] - ] + ], + 'limit' => self::LIMIT ] ]; - private function cleanArticle($article_html) - { - $offset_p = strpos($article_html, '

'); - $offset_figure = strpos($article_html, '', '', $article_html); - $article_html = str_replace('', '', $article_html); - $article_html = StripWithDelimiters($article_html, ''); - $article_html = stripWithDelimiters($article_html, 'innertext, 'ImageObject","url":"', '"'); + if ($imageObject !== false) { + $enclosure = $imageObject; + } + } - if (is_null($article_thumbnail)) { - $article_thumbnail = extractFromDelimiters($element->innertext, 'find('div.c-shortcodeGallery') as $cleanup) { + $cleanup->outertext = ''; } - if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, self::URI . 'news/') !== false) { - $article_html = getSimpleHTMLDOMCached($article_uri) or $article_html = null; - - if (!is_null($article_html)) { - if (empty($article_thumbnail)) { - $article_thumbnail = $article_html->find('div.originalImage', 0); - } - if (empty($article_thumbnail)) { - $article_thumbnail = $article_html->find('span.imageContainer', 0); - } - if (is_object($article_thumbnail)) { - $article_thumbnail = $article_thumbnail->find('img', 0)->src; - } - - $article_content .= trim( - $this->cleanArticle( - extractFromDelimiters( - $article_html, - 'find('figure') as $figure) { + $img = $figure->find('img', 0); + if ($img) { + $figure->outertext = $img->outertext; } + } + + $content = $content->innertext; + + if ($enclosure) { + $content = "

" . $content; + } + + if ($headline) { + $content = '

' . $headline->plaintext . '


' . $content; + } + + $item = []; + $item['uri'] = $article_uri; + $item['title'] = $title; + $item['author'] = $author; + $item['content'] = $content; - $item = []; - $item['uri'] = $article_uri; - $item['title'] = $article_title; - $item['author'] = $article_author; - $item['timestamp'] = $article_timestamp; - $item['enclosures'] = [$article_thumbnail]; - $item['content'] = $article_content; - $this->items[] = $item; + if (!is_null($date)) { + $item['timestamp'] = $date; } + + if (!is_null($enclosure)) { + $item['enclosures'] = [$enclosure]; + } + + $this->items[] = $item; } } } diff --git a/bridges/SitemapBridge.php b/bridges/SitemapBridge.php index bdf662eedd7..bbbb3e16616 100644 --- a/bridges/SitemapBridge.php +++ b/bridges/SitemapBridge.php @@ -131,7 +131,7 @@ protected function sitemapXmlToList($sitemap, $url_pattern = '', $limit = 0, $ke foreach ($sitemap->find('sitemap') as $nested_sitemap) { $url = $nested_sitemap->find('loc'); if (!empty($url)) { - $url = $url[0]->plaintext; + $url = trim($url[0]->plaintext); if (str_ends_with(strtolower($url), '.xml')) { $nested_sitemap_xml = $this->getSitemapXml($url, true); $nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true); @@ -148,8 +148,8 @@ protected function sitemapXmlToList($sitemap, $url_pattern = '', $limit = 0, $ke $url = $item->find('loc'); $lastmod = $item->find('lastmod'); if (!empty($url) && !empty($lastmod)) { - $url = $url[0]->plaintext; - $lastmod = $lastmod[0]->plaintext; + $url = trim($url[0]->plaintext); + $lastmod = trim($lastmod[0]->plaintext); $timestamp = strtotime($lastmod); if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) { $links[$url] = $timestamp;