1
0
mirror of https://github.com/chylex/Nextcloud-News.git synced 2025-08-18 20:25:00 +02:00
Files
.github
.tx
appinfo
bin
css
docs
img
js
l10n
lib
AppInfo
Command
Config
Controller
Cron
Db
Explore
Fetcher
Hooks
Migration
Plugin
Scraper
IScraper.php
Scraper.php
Search
Service
Settings
Utility
screenshots
templates
tests
.editorconfig
.gitignore
.mailmap
AUTHORS.md
CHANGELOG.md
CONTRIBUTING.md
COPYING
Makefile
README.md
composer.json
composer.lock
mkdocs.yml
phpstan.neon.dist
phpunit.xml
Daniel Rheinbay 0470bcb9ff Fix spelling of receive
recive => receive

Signed-off-by: Daniel Rheinbay <danielrheinbay@gmail.com>
2021-10-03 14:50:18 +02:00

106 lines
3.1 KiB
PHP

<?php
/**
* Nextcloud - News
*
* This file is licensed under the Affero General Public License version 3 or
* later. See the COPYING file.
*
* @author Gioele Falcetti <thegio.f@gmail.com>
* @copyright 2019 Gioele Falcetti
*/
namespace OCA\News\Scraper;
use andreskrey\Readability\Readability;
use andreskrey\Readability\Configuration;
use andreskrey\Readability\ParseException;
use Psr\Log\LoggerInterface;
/**
 * Downloads a web page with cURL and extracts its main article content
 * using the Readability library.
 *
 * Usage: call scrape() first; when it returns true, getContent() and
 * getRTL() expose the result of the most recent scrape.
 */
class Scraper implements IScraper
{
    /** @var LoggerInterface Receives error messages for failed downloads/parses. */
    private $logger;

    /** @var Configuration Readability configuration reused for every scrape. */
    private $config;

    /** @var Readability|null Parser of the last successful scrape, null otherwise. */
    private $readability;

    /** @var array cURL options applied to every request. */
    private $curl_opts;

    /**
     * @param LoggerInterface $logger Logger used to report scraping errors.
     */
    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->config = new Configuration([
            'FixRelativeURLs' => true,
            'SummonCthulhu' => true, // Remove <script>
        ]);
        $this->readability = null;
        $this->curl_opts = [
            CURLOPT_RETURNTRANSFER => true, // return web page
            CURLOPT_HEADER => false, // do not return headers
            CURLOPT_FOLLOWLOCATION => true, // follow redirects
            //CURLOPT_USERAGENT => "php-news", // who am i
            CURLOPT_AUTOREFERER => true, // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
            CURLOPT_TIMEOUT => 120, // timeout on response
            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
        ];
    }

    /**
     * Fetch the body of $url, following redirects.
     *
     * @param string $url Address to download.
     *
     * @return array Two elements: the response body (string) or false when
     *               the transfer failed, and the effective URL after
     *               redirects (the requested URL if cURL reports none).
     */
    private function getHTTPContent(string $url): array
    {
        $handler = curl_init($url);
        if ($handler === false) {
            // No usable handle: report it like any other failed transfer.
            return [false, $url];
        }

        curl_setopt_array($handler, $this->curl_opts);
        $content = curl_exec($handler);
        // The effective URL is the address reached after following redirects.
        $effectiveUrl = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL);
        curl_close($handler);

        if (!is_string($effectiveUrl) || $effectiveUrl === '') {
            $effectiveUrl = $url;
        }

        return [$content, $effectiveUrl];
    }

    /**
     * Download $url and run Readability on the response.
     *
     * @param string $url Address of the page to scrape.
     *
     * @return bool True when the page was downloaded and parsed; false when
     *              either step failed (the failure is logged and any previous
     *              result is discarded).
     */
    public function scrape(string $url): bool
    {
        [$content, $redirected_url] = $this->getHTTPContent($url);
        if ($content === false) {
            $this->logger->error('Unable to receive content from {url}', [
                'url' => $url,
            ]);
            $this->readability = null;
            return false;
        }

        // Update URL used to convert relative URLs
        $this->config->setOriginalURL($redirected_url);
        $readability = new Readability($this->config);

        try {
            $readability->parse($content);
        } catch (ParseException $e) {
            $this->logger->error('Unable to parse content from {url}', [
                'url' => $url,
            ]);
            // A failed parse must not be reported as a successful scrape,
            // otherwise callers would consume an empty result.
            $this->readability = null;
            return false;
        }

        $this->readability = $readability;
        return true;
    }

    /**
     * @return string|null Content extracted by the last successful scrape,
     *                     or null when no scrape result is available.
     */
    public function getContent(): ?string
    {
        if ($this->readability === null) {
            return null;
        }

        return $this->readability->getContent();
    }

    /**
     * Tell whether the last scraped document is written right-to-left.
     *
     * @param bool $default Value returned when no scrape result is available
     *                      or the parser reports no direction.
     *
     * @return bool True when the reported direction is "rtl".
     */
    public function getRTL(bool $default = false): bool
    {
        if ($this->readability === null) {
            return $default;
        }

        $direction = $this->readability->getDirection();
        if ($direction === null) {
            return $default;
        }

        return $direction === "rtl";
    }
}