mirror of
https://github.com/chylex/Nextcloud-News.git
synced 2025-08-18 20:25:00 +02:00
.github
.tx
appinfo
bin
css
docs
img
js
l10n
lib
AppInfo
Command
Config
Controller
Cron
Db
Explore
Fetcher
Hooks
Migration
Plugin
Scraper
IScraper.php
Scraper.php
Search
Service
Settings
Utility
screenshots
templates
tests
.editorconfig
.gitignore
.mailmap
AUTHORS.md
CHANGELOG.md
CONTRIBUTING.md
COPYING
Makefile
README.md
composer.json
composer.lock
mkdocs.yml
phpstan.neon.dist
phpunit.xml
106 lines
3.1 KiB
PHP
106 lines
3.1 KiB
PHP
<?php
|
|
/**
|
|
* Nextcloud - News
|
|
*
|
|
* This file is licensed under the Affero General Public License version 3 or
|
|
* later. See the COPYING file.
|
|
*
|
|
* @author Gioele Falcetti <thegio.f@gmail.com>
|
|
* @copyright 2019 Gioele Falcetti
|
|
*/
|
|
|
|
namespace OCA\News\Scraper;
|
|
|
|
use andreskrey\Readability\Readability;
|
|
use andreskrey\Readability\Configuration;
|
|
use andreskrey\Readability\ParseException;
|
|
use Psr\Log\LoggerInterface;
|
|
|
|
/**
 * Downloads a web page with cURL and extracts its main article content
 * with Readability.
 *
 * Usage: call scrape() first; when it returns true, getContent() and
 * getRTL() expose the result of that scrape.
 */
class Scraper implements IScraper
{
    /** @var LoggerInterface Receives error messages for failed downloads and parses. */
    private $logger;

    /** @var Configuration Readability settings reused by every scrape; the original URL is set per call. */
    private $config;

    /** @var Readability|null Parser holding the last successfully parsed page, or null when there is none. */
    private $readability;

    /** @var array cURL options applied to every request. */
    private $curl_opts;

    /**
     * @param LoggerInterface $logger Logger used to report download and parse failures.
     */
    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->config = new Configuration([
            'FixRelativeURLs' => true,
            'SummonCthulhu'   => true, // Remove <script>
        ]);
        $this->readability = null;

        $this->curl_opts = [
            CURLOPT_RETURNTRANSFER => true,    // return web page
            CURLOPT_HEADER         => false,   // do not return headers
            CURLOPT_FOLLOWLOCATION => true,    // follow redirects
            //CURLOPT_USERAGENT      => "php-news", // who am i
            CURLOPT_AUTOREFERER    => true,    // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,     // timeout on connect
            CURLOPT_TIMEOUT        => 120,     // timeout on response
            CURLOPT_MAXREDIRS      => 10,      // stop after 10 redirects
        ];
    }

    /**
     * Fetches a URL with the configured cURL options.
     *
     * @param string $url Address to download.
     *
     * @return array Two-element list: the response body (string), or false
     *               when the transfer could not be performed, followed by
     *               the URL reported by cURL after redirects were followed
     *               (the requested URL when cURL reports none).
     */
    private function getHTTPContent(string $url): array
    {
        $handler = curl_init($url);
        if ($handler === false) {
            // No handle could be created; report it like a failed transfer.
            return [false, $url];
        }

        curl_setopt_array($handler, $this->curl_opts);
        $content = curl_exec($handler);
        $effectiveUrl = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL);
        curl_close($handler);

        // Use the url after the redirects have been followed, falling back
        // to the requested one if cURL did not report a usable value.
        if (!is_string($effectiveUrl) || $effectiveUrl === '') {
            $effectiveUrl = $url;
        }

        return [$content, $effectiveUrl];
    }

    /**
     * Downloads and parses the page at the given URL.
     *
     * Any result from a previous call is replaced. When the download or
     * the parse fails, an error is logged, the stored result is cleared and
     * false is returned, so getContent() and getRTL() never read from a
     * parser that did not complete.
     *
     * @param string $url Address of the page to scrape.
     *
     * @return bool True when the page was downloaded and parsed, false otherwise.
     */
    public function scrape(string $url): bool
    {
        [$content, $redirected_url] = $this->getHTTPContent($url);
        if ($content === false) {
            $this->logger->error('Unable to receive content from {url}', [
                'url' => $url,
            ]);
            $this->readability = null;
            return false;
        }

        // Update URL used to convert relative URLs
        $this->config->setOriginalURL($redirected_url);
        $readability = new Readability($this->config);

        try {
            $readability->parse($content);
        } catch (ParseException $e) {
            $this->logger->error('Unable to parse content from {url}', [
                'url' => $url,
            ]);
            // Treat a parse failure like a download failure: no result.
            $this->readability = null;
            return false;
        }

        // Only publish the parser once it holds a complete result.
        $this->readability = $readability;
        return true;
    }

    /**
     * Returns the content extracted by the last successful scrape.
     *
     * @return string|null Whatever the parser reports as content, or null
     *                     when no scrape result is available.
     */
    public function getContent(): ?string
    {
        if ($this->readability === null) {
            return null;
        }

        return $this->readability->getContent();
    }

    /**
     * Tells whether the last scraped page is written right-to-left.
     *
     * @param bool $default Value returned when no scrape result is available
     *                      or the parser reports no direction.
     *
     * @return bool True when the reported direction is "rtl".
     */
    public function getRTL(bool $default = false): bool
    {
        if ($this->readability === null) {
            return $default;
        }

        $direction = $this->readability->getDirection();
        if ($direction === null) {
            return $default;
        }

        return $direction === "rtl";
    }
}
|