From 47f52b5912f442389e9b5a867044efc6f4680b7b Mon Sep 17 00:00:00 2001 From: ORelio Date: Mon, 9 Oct 2023 08:48:21 +0200 Subject: [PATCH] Add CSS Selector Feed Expander (#3732) * Add CSS Selector Feed Expander This bridge combines CssSelectorBridge with FeedExpander Allows expanding a feed using CSS selectors * Fix code linting --------- Co-authored-by: ORelio --- bridges/CssSelectorBridge.php | 9 ++- bridges/CssSelectorFeedExpanderBridge.php | 98 +++++++++++++++++++++++ bridges/SitemapBridge.php | 6 +- 3 files changed, 106 insertions(+), 7 deletions(-) create mode 100644 bridges/CssSelectorFeedExpanderBridge.php diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php index dd8fe2289e8..f6ab8d15588 100644 --- a/bridges/CssSelectorBridge.php +++ b/bridges/CssSelectorBridge.php @@ -60,11 +60,12 @@ class CssSelectorBridge extends BridgeAbstract ] ]; - private $feedName = ''; + protected $feedName = ''; + protected $homepageUrl = ''; public function getURI() { - $url = $this->getInput('home_page'); + $url = $this->homepageUrl; if (empty($url)) { $url = parent::getURI(); } @@ -81,7 +82,7 @@ public function getName() public function collectData() { - $url = $this->getInput('home_page'); + $this->homepageUrl = $this->getInput('home_page'); $url_selector = $this->getInput('url_selector'); $url_pattern = $this->getInput('url_pattern'); $content_selector = $this->getInput('content_selector'); @@ -90,7 +91,7 @@ public function collectData() $discard_thumbnail = $this->getInput('discard_thumbnail'); $limit = $this->getInput('limit') ?? 10; - $html = defaultLinkTo(getSimpleHTMLDOM($url), $url); + $html = defaultLinkTo(getSimpleHTMLDOM($this->homepageUrl), $this->homepageUrl); $this->feedName = $this->titleCleanup($this->getPageTitle($html), $title_cleanup); $items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup); diff --git a/bridges/CssSelectorFeedExpanderBridge.php b/bridges/CssSelectorFeedExpanderBridge.php new file mode 100644 index 00000000000..7e1f630f1f9 --- /dev/null +++ b/bridges/CssSelectorFeedExpanderBridge.php @@ -0,0 +1,98 @@ + [ + 'name' => 'Feed: URL of truncated RSS feed', + 'exampleValue' => 'https://example.com/feed.xml', + 'required' => true + ], + 'content_selector' => [ + 'name' => 'Selector for each article content', + 'title' => <<. + Everything inside that element becomes feed item content. + EOT, + 'exampleValue' => 'article.content', + 'required' => true + ], + 'content_cleanup' => [ + 'name' => '[Optional] Content cleanup: List of items to remove', + 'title' => 'Selector for unnecessary elements to remove inside article contents.', + 'exampleValue' => 'div.ads, div.comments', + ], + 'dont_expand_metadata' => [ + 'name' => '[Optional] Don\'t expand metadata', + 'title' => "This bridge will attempt to fill missing fields using metadata from the webpage.\nCheck to disable.", + 'type' => 'checkbox', + ], + 'discard_thumbnail' => [ + 'name' => '[Optional] Discard thumbnail set by site author', + 'title' => 'Some sites set their logo as thumbnail for every article. Use this option to discard it.', + 'type' => 'checkbox', + ], + 'limit' => self::LIMIT + ] + ]; + + public function collectData() + { + $url = $this->getInput('feed'); + $content_selector = $this->getInput('content_selector'); + $content_cleanup = $this->getInput('content_cleanup'); + $dont_expand_metadata = $this->getInput('dont_expand_metadata'); + $discard_thumbnail = $this->getInput('discard_thumbnail'); + $limit = $this->getInput('limit'); + + $feed_expander = new CssSelectorFeedExpanderBridgeInternal(); + $items = $feed_expander->collectExpandableDatas($url)->getItems(); + + $this->homepageUrl = urljoin($url, '/'); + $this->feedName = $feed_expander->getName(); + + foreach ($items as $item_from_feed) { + $item_expanded = $this->expandEntryWithSelector( + $item_from_feed['uri'], + $content_selector, + $content_cleanup + ); + + if ($dont_expand_metadata) { + // Take feed item, only replace content from expanded data + $content = $item_expanded['content']; + $item_expanded = $item_from_feed; + $item_expanded['content'] = $content; + } else { + // Take expanded item, but give priority to metadata already in source item + foreach ($item_from_feed as $field => $val) { + if ($field !== 'content') { + $item_expanded[$field] = $val; + } + } + } + + if ($discard_thumbnail && isset($item_expanded['enclosures'])) { + unset($item_expanded['enclosures']); + } + + $this->items[] = $item_expanded; + } + } +} diff --git a/bridges/SitemapBridge.php b/bridges/SitemapBridge.php index 78526e6eb77..bdf662eedd7 100644 --- a/bridges/SitemapBridge.php +++ b/bridges/SitemapBridge.php @@ -64,7 +64,7 @@ class SitemapBridge extends CssSelectorBridge public function collectData() { - $url = $this->getInput('home_page'); + $this->homepageUrl = $this->getInput('home_page'); $url_pattern = $this->getInput('url_pattern'); $content_selector = $this->getInput('content_selector'); $content_cleanup = $this->getInput('content_cleanup'); @@ -73,8 +73,8 @@ public function collectData() $discard_thumbnail = $this->getInput('discard_thumbnail'); $limit = $this->getInput('limit'); - $this->feedName = $this->titleCleanup($this->getPageTitle($url), $title_cleanup); - $sitemap_url = empty($site_map) ? $url : $site_map; + $this->feedName = $this->titleCleanup($this->getPageTitle($this->homepageUrl), $title_cleanup); + $sitemap_url = empty($site_map) ? $this->homepageUrl : $site_map; $sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map)); $links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit);