diff --git a/bridges/AutoPodcasterBridge.php b/bridges/AutoPodcasterBridge.php new file mode 100644 index 00000000000..3a46749be7d --- /dev/null +++ b/bridges/AutoPodcasterBridge.php @@ -0,0 +1,152 @@ + [ + 'url' => [ + 'name' => 'URL', + 'exampleValue' => 'https://lorem-rss.herokuapp.com/feed?unit=day', + 'required' => true, + ], + 'feed_only' => [ + 'name' => 'Only look at the content of the feed, don\'t check on the website', + 'type' => 'checkbox', + 'defaultValue' => 'checked', + 'required' => false, + ], + ], + ]; + + public function collectData() + { + if ( + $this->getInput('url') + && substr($this->getInput('url'), 0, strlen('http')) !== 'http' + ) { + // just in case someone find a way to access local files by playing with the url + throw new \Exception('The url parameter must either refer to http or https protocol.'); + } + $this->collectExpandableDatas($this->getURI()); + } + + protected function parseItem($item) + { + $dom = false; + if (!$this->getInput('feed_only')) { + $dom = getSimpleHTMLDOMCached($item['uri'], 86400 * 10); // 10d + // $dom will be false in case of errors + } + $audios = []; + if ($dom) { + /* 1st extraction method: by "audio" tag */ + $audios = array_merge($audios, $this->extractAudio($dom)); + + /* 2nd extraction method: by "iframe" tag */ + $audios = array_merge($audios, $this->extractIframeArchive($dom)); + } elseif ($item['content'] !== null) { + $item_dom = str_get_html($item['content']); + /* 1st extraction method: by "audio" tag */ + $audios = array_merge($audios, $this->extractAudio($item_dom)); + + /* 2nd extraction method: by "iframe" tag */ + $audios = array_merge($audios, $this->extractIframeArchive($item_dom)); + } + + if ($audios === []) { + return null; + } + + // This will actually overwrite any exiting enclosures + $item['enclosures'] = []; + + foreach ($audios as $audio) { + $item['enclosures'][] = $audio['sources'][0]; + } + + return $item; + } + + private function extractAudio($dom) + { + $audios = []; + foreach ($dom->find('audio') as $audioEl) { + $sources = []; + if ($audioEl->src !== false) { + $sources[] = $audioEl->src; + } + foreach ($audioEl->find('source') as $sourceEl) { + $sources[] = $sourceEl->src; + } + if ($sources) { + $audios[$sources[0]] = ['sources' => $sources]; + } + } + return $audios; + } + + /** + * Detects iframes pointing to https://archive.org/embed + */ + private function extractIframeArchive($dom) + { + $audios = []; + + foreach ($dom->find('iframe') as $iframeEl) { + if (strpos($iframeEl->src, 'https://archive.org/embed/') === 0) { + $listURL = preg_replace('/\/embed\//', '/details/', $iframeEl->src, 1) . '?output=json'; + $baseURL = preg_replace('/\/embed\//', '/download/', $iframeEl->src, 1); + + $json = getContents($listURL); + + $list = Json::decode($json, false); + $audios = []; + foreach ($list->files as $name => $data) { + if ( + $data->source === 'original' + && $this->isAudioFormat($data->format) + ) { + $audios[$baseURL . $name] = ['sources' => [$baseURL . $name]]; + } + } + foreach ($list->files as $name => $data) { + if ( + $data->source === 'derivative' + && $this->isAudioFormat($data->format) + && isset($audios[$baseURL . '/' . $data->original]) + ) { + $audios[$baseURL . '/' . $data->original]['sources'][] = $baseURL . $name; + } + } + } + } + + return $audios; + } + + private function isAudioFormat($formatString): bool + { + // TODO: str_contains and str_starts_with + return strpos($formatString, 'MP3') !== false || strpos($formatString, 'Ogg') === 0; + } + + public function getName() + { + if (!is_null($this->getInput('url'))) { + return self::NAME . ' : ' . $this->getInput('url'); + } + + return parent::getName(); + } + + public function getURI() + { + return $this->getInput('url') ?? parent::getURI(); + } +} +