1
0
mirror of https://github.com/chylex/Nextcloud-News.git synced 2025-05-14 04:34:05 +02:00
Nextcloud-News/lib/Fetcher/FeedFetcher.php
Sean Molenaar 48a130d3c0 Allow titles to be null
Issue GH-867

Signed-off-by: Sean Molenaar <sean@seanmolenaar.eu>
2020-10-12 21:40:23 +02:00

347 lines
10 KiB
PHP
Executable File

<?php
/**
* Nextcloud - News
*
* This file is licensed under the Affero General Public License version 3 or
* later. See the COPYING file.
*
* @author Alessandro Cosentino <cosenal@gmail.com>
* @author Bernhard Posselt <dev@bernhard-posselt.com>
* @copyright 2012 Alessandro Cosentino
* @copyright 2012-2014 Bernhard Posselt
*/
namespace OCA\News\Fetcher;
use DateTime;
use Favicon\Favicon;
use FeedIo\Feed\ItemInterface;
use FeedIo\FeedInterface;
use FeedIo\FeedIo;
use Net_URL2;
use OCP\IL10N;
use OCA\News\Db\Item;
use OCA\News\Db\Feed;
use OCA\News\Utility\Time;
use OCA\News\Scraper\Scraper;
use Psr\Log\LoggerInterface;
use SimpleXMLElement;
/**
* Generic feed fetcher built on top of FeedIo.
*
* Reads a remote feed and maps it, together with its entries, onto the
* News Feed and Item entities. Its canHandle() accepts every URL.
*/
class FeedFetcher implements IFeedFetcher
{
/**
* Used to look up the favicon of a feed URL.
*
* @var Favicon
*/
private $faviconFactory;
/**
* Reader that downloads and parses the feeds.
*
* @var FeedIo
*/
private $reader;
/**
* Scraper used to retrieve the full text of an item from its link.
*
* @var Scraper
*/
private $scraper;
/**
* Localisation service handed in by the constructor.
*
* @var IL10N
*/
private $l10n;
/**
* Time source, used for the "added" timestamp of a feed.
*
* @var Time
*/
private $time;
/**
* Logger that receives the debug output of a fetch.
*
* @var LoggerInterface
*/
private $logger;
/**
 * Store the collaborators this fetcher works with.
 *
 * @param FeedIo          $fetcher Reader that downloads and parses feeds
 * @param Favicon         $favicon Service used to look up a feed's favicon
 * @param Scraper         $scraper Scraper used for full-text retrieval
 * @param IL10N           $l10n    Localisation service
 * @param Time            $time    Time source for the "added" timestamp
 * @param LoggerInterface $logger  Logger for debug output
 */
public function __construct(
    FeedIo $fetcher,
    Favicon $favicon,
    Scraper $scraper,
    IL10N $l10n,
    Time $time,
    LoggerInterface $logger
) {
    // Infrastructure helpers first ...
    $this->logger = $logger;
    $this->time   = $time;
    $this->l10n   = $l10n;

    // ... then the services doing the actual fetching work.
    $this->scraper        = $scraper;
    $this->faviconFactory = $favicon;
    $this->reader         = $fetcher;
}
/**
 * Tell whether this fetcher is able to process the given URL.
 *
 * This is the catch-all fetcher: the URL is not inspected and every
 * URL is accepted.
 *
 * @param string $url The URL to check (ignored)
 *
 * @return bool Always true
 */
public function canHandle(string $url): bool
{
    return true;
}
/**
 * Fetch a feed from remote
 *
 * Reads the feed at $url (conditionally, when a last-modified value is known)
 * and converts it and its entries into Feed and Item entities.
 *
 * @inheritdoc
 *
 * @param string      $url             URL of the feed
 * @param bool        $favicon         Whether the favicon should be looked up as well
 * @param string|null $lastModified    Last-modified value of the previous fetch;
 *                                     when empty the feed is read unconditionally
 * @param bool        $fullTextEnabled Whether each item's link should be scraped for its full text
 * @param string|null $user            User name for feeds that need credentials
 * @param string|null $password        Password belonging to $user
 *
 * @return array [null, []] when the response reports the feed as unmodified,
 *               otherwise [Feed, Item[]]
 */
public function fetch(
    string $url,
    bool $favicon,
    ?string $lastModified,
    bool $fullTextEnabled,
    ?string $user,
    ?string $password
): array {
    $url2 = new Net_URL2($url);
    // Compare against '' rather than using empty(): empty('0') is true, which
    // would silently drop the credentials of a user literally named "0".
    if ($user !== null && trim($user) !== '') {
        // The userinfo part of a URL is percent-encoded (RFC 3986). urlencode()
        // is form encoding and turns a space into "+", which is a literal plus
        // sign in userinfo, so rawurlencode() has to be used here.
        // A missing password is encoded as the empty string.
        $url2->setUserinfo(rawurlencode($user), rawurlencode($password ?? ''));
    }
    $url = $url2->getNormalizedURL();

    $this->reader->resetFilters();
    // $lastModified is typed ?string, so emptiness is the only thing to check.
    if (empty($lastModified)) {
        $resource = $this->reader->read($url);
    } else {
        $resource = $this->reader->readSince($url, new DateTime($lastModified));
    }

    $response = $resource->getResponse();
    if (!$response->isModified()) {
        $this->logger->debug('Feed {url} was not modified since last fetch. old: {old}, new: {new}', [
            'url' => $url,
            'old' => print_r($lastModified, true),
            'new' => print_r($response->getLastModified(), true),
        ]);
        return [null, []];
    }

    $location = $resource->getUrl();
    $parsedFeed = $resource->getFeed();
    $feed = $this->buildFeed(
        $parsedFeed,
        $url,
        $favicon,
        $location
    );

    $items = [];
    $feedRtl = $this->determineRtl($parsedFeed);
    $feedName = $parsedFeed->getTitle();
    $this->logger->debug('Feed {url} was modified since last fetch. #{count} items', [
        'url' => $url,
        'count' => count($parsedFeed),
    ]);

    foreach ($parsedFeed as $item) {
        $body = null;
        // Every item starts out with the direction of the feed; the scraper
        // may override it for the scraped content.
        $itemRtl = $feedRtl;

        // Scrape the content if full-text is enabled and if the feed provides a URL
        if ($fullTextEnabled) {
            $itemLink = $item->getLink();
            if ($itemLink !== null && $this->scraper->scrape($itemLink)) {
                $body = $this->scraper->getContent();
                $itemRtl = $this->scraper->getRTL($itemRtl);
            }
        }

        $builtItem = $this->buildItem($item, $body, $itemRtl);
        $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
            'title' => $builtItem->getTitle(),
            'feed' => $feedName,
            'datetime' => $builtItem->getLastModified(),
        ]);
        $items[] = $builtItem;
    }

    return [$feed, $items];
}
/**
 * Run HTML entity decoding over a string two times in a row.
 *
 * The second pass resolves entities that were themselves escaped,
 * e.g. "&amp;amp;" becomes "&amp;" and then "&".
 *
 * @param string $string String to decode
 *
 * @return string The string after both decoding passes
 */
private function decodeTwice(string $string): string
{
    $flags = ENT_QUOTES | ENT_HTML5;
    $decodedOnce = html_entity_decode($string, $flags, 'UTF-8');

    return html_entity_decode($decodedOnce, $flags, 'UTF-8');
}
/**
 * Check if a feed is RTL or not
 *
 * The decision is based on the language the feed declares: it is RTL when
 * the lower-cased language code starts with one of the known RTL prefixes.
 * A feed that declares no language is treated as left-to-right.
 *
 * @param FeedInterface $parsedFeed The feed that was parsed
 *
 * @return bool True when the feed language is a right-to-left language
 */
protected function determineRtl(FeedInterface $parsedFeed): bool
{
    $language = $parsedFeed->getLanguage();
    // Feeds are not required to declare a language. Bail out early instead
    // of handing null to strtolower(), which is deprecated since PHP 8.1.
    if ($language === null || $language === '') {
        return false;
    }
    $language = strtolower($language);

    $rtlPrefixes = [
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    ];

    // NOTE(review): this is a plain prefix match, so three-letter codes that
    // merely start with one of the prefixes (e.g. "arn") are matched as well.
    foreach ($rtlPrefixes as $prefix) {
        if (strpos($language, $prefix) === 0) {
            return true;
        }
    }

    return false;
}
/**
 * Build an item based on a feed.
 *
 * Copies link, guid, dates, title, author, body and media information of a
 * parsed feed entry onto a new, unread Item and generates its search index.
 *
 * @param ItemInterface $parsedItem The item to use
 * @param string|null   $body       Text of the item, if not provided use description from $parsedItem
 * @param bool          $RTL        True if the feed is RTL (Right-to-left)
 *
 * @return Item
 */
protected function buildItem(ItemInterface $parsedItem, ?string $body = null, bool $RTL = false): Item
{
    $item = new Item();
    $item->setUnread(true);
    $item->setUrl($parsedItem->getLink());
    $item->setGuid($parsedItem->getPublicId());
    $item->setGuidHash(md5($item->getGuid()));

    // Entries without a modification date are stamped with the current time.
    $lastModified = $parsedItem->getLastModified() ?? new DateTime();

    // Prefer an explicit publication date ("pubDate", then "published") and
    // fall back to the modification date.
    if ($parsedItem->getValue('pubDate') !== null) {
        $pubDT = new DateTime($parsedItem->getValue('pubDate'));
    } elseif ($parsedItem->getValue('published') !== null) {
        $pubDT = new DateTime($parsedItem->getValue('published'));
    } else {
        $pubDT = $lastModified;
    }

    $item->setPubDate($pubDT->getTimestamp());
    $item->setLastModified($lastModified->getTimestamp());
    $item->setRtl($RTL);

    // unescape content because angularjs helps against XSS
    if ($parsedItem->getTitle() !== null) {
        $item->setTitle($this->decodeTwice($parsedItem->getTitle()));
    }
    $author = $parsedItem->getAuthor();
    if ($author !== null && $author->getName() !== null) {
        $item->setAuthor($this->decodeTwice($author->getName()));
    }

    // Use description from feed if body is not provided (by a scraper)
    if ($body === null) {
        $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
    }
    // An entry may carry neither content nor description. Normalise that to
    // an empty string so the string functions below never receive null.
    $body = (string) $body;

    // mb_detect_encoding() returns false when it cannot decide; false is not
    // a valid source encoding for mb_convert_encoding().
    // NOTE(review): UTF-8 is assumed as the fallback - confirm that the feed
    // reader and the scraper hand over UTF-8 text.
    $encoding = mb_detect_encoding($body);
    if ($encoding === false) {
        $encoding = 'UTF-8';
    }

    // purification is done in the service layer
    // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
    $body = mb_convert_encoding(
        $body,
        'HTML-ENTITIES',
        $encoding
    );

    if (strpos($body, 'CDATA') !== false) {
        // Buffer libxml errors only for this parse and restore the previous,
        // process-wide setting afterwards.
        $previousErrorMode = libxml_use_internal_errors(true);
        // Drop errors left behind by earlier parses; otherwise
        // libxml_get_last_error() could report a failure that is not ours.
        libxml_clear_errors();

        $data = simplexml_load_string(
            "<?xml version=\"1.0\"?><item>$body</item>",
            SimpleXMLElement::class,
            LIBXML_NOCDATA
        );
        // Only take over the unwrapped text when the body consisted of text /
        // CDATA alone. The string cast ignores child elements, so a body that
        // is well-formed markup merely mentioning "CDATA" would lose content.
        if ($data !== false && libxml_get_last_error() === false && $data->count() === 0) {
            $body = (string) $data;
        }

        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $item->setBody($body);

    if ($parsedItem->hasMedia()) {
        // TODO: Fix multiple media support
        foreach ($parsedItem->getMedias() as $media) {
            // Skip media that has neither a supported mime type nor a
            // thumbnail nor a description.
            if (!$item->isSupportedMime($media->getType())
                && !$media->getThumbnail()
                && !$media->getDescription()
            ) {
                continue;
            }
            // Each accepted media overwrites the values of the previous one.
            $item->setEnclosureMime($media->getType());
            $item->setEnclosureLink($media->getUrl());
            $item->setMediaThumbnail($media->getThumbnail());
            if ($media->getDescription()) {
                $description = str_replace("\n", "<br>", $media->getDescription());
                $item->setMediaDescription($description);
            }
        }
    }

    $item->generateSearchIndex();

    return $item;
}
/**
 * Create a Feed entity from a parsed feed.
 *
 * Title, link and HTTP last-modified date are taken from the parsed feed,
 * the "added" timestamp from the time service and, on request, the
 * favicon from the favicon service.
 *
 * @param FeedInterface $feed       Feed to build from
 * @param string        $url        URL to use
 * @param bool          $getFavicon To get the favicon
 * @param string        $location   String base URL
 *
 * @return Feed
 */
protected function buildFeed(FeedInterface $feed, string $url, bool $getFavicon, string $location): Feed
{
    $newFeed = new Feed();

    // unescape content because angularjs helps against XSS
    if ($feed->getTitle() !== null) {
        $decodedTitle = $this->decodeTwice($feed->getTitle());
        $newFeed->setTitle(strip_tags($decodedTitle));
    }

    // the url used to add the feed
    $newFeed->setUrl($url);
    // the url where the feed was found
    $newFeed->setLocation($location);
    // <link> attribute in the feed
    $newFeed->setLink($feed->getLink());

    if ($feed->getLastModified() instanceof DateTime) {
        $httpDate = $feed->getLastModified()->format(DateTime::RSS);
        $newFeed->setHttpLastModified($httpDate);
    }

    $newFeed->setAdded($this->time->getTime());

    // The favicon lookup only happens when explicitly asked for.
    if ($getFavicon) {
        $newFeed->setFaviconLink($this->faviconFactory->get($url));
    }

    return $newFeed;
}
}