Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated to support Laravel 10.x and 11.x versions. #13

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
}
],
"require": {
"php": "^7.3|^8.0",
"laravel/framework": "^6.20.12||^7.30.3||^8.4||^9.2",
"php": ">=8.1",
"laravel/framework": "^9.2||^10.0||^11.0",
"guzzlehttp/guzzle": "^7.2",
"vdb/php-spider": "^v0.6.3",
"vdb/php-spider": "^v0.7.2",
"nesbot/carbon": "^2.41",
"spatie/robots-txt": "^1.0||^2.0"
},
Expand Down
31 changes: 18 additions & 13 deletions src/Commands/SitemapCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
use Spatie\Robots\Robots;
use Illuminate\Console\Command;
use VDB\Spider\Event\SpiderEvents;
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\QueueManager\QueueManagerInterface;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
Expand All @@ -37,9 +36,9 @@ class SitemapCommand extends Command
/**
* Generate the sitemap
*
* @return void
* @return int
*/
public function handle()
public function handle(): int
{
// Crawl the site
$this->info('Starting site crawl...');
Expand All @@ -51,6 +50,8 @@ public function handle()

// Signal completion
$this->info('Sitemap generation completed.');

return Command::SUCCESS;
}

/**
Expand All @@ -59,7 +60,7 @@ public function handle()
* @param string $url
* @return array $resources
*/
protected function crawlWebsite($url)
protected function crawlWebsite(string $url): array
{
// Load the robots.txt from the site.
$robots_url = $url . '/robots.txt';
Expand All @@ -71,7 +72,7 @@ protected function crawlWebsite($url)

// Add a URI discoverer. Without it, the spider does nothing.
// In this case, we want <a> tags and the canonical link
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]"));
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]//a"));
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter([$url], true));

// Set limits
Expand All @@ -81,8 +82,8 @@ protected function crawlWebsite($url)
// Let's add something to enable us to stop the script
$spider->getDispatcher()->addListener(
SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
function (Event $event) {
consoleOutput()->error("Crawl aborted.");
function () {
echo "Crawl aborted.";
exit();
}
);
Expand All @@ -104,8 +105,8 @@ function (Event $event) {
$this->comment("Failed: " . count($statsHandler->getFailed()));
$this->comment("Persisted: " . count($statsHandler->getPersisted()));

// Finally we could do some processing on the downloaded resources
// In this example, we will echo the title of all resources
// Finally, we could do some processing on the downloaded resources
// In this example we will echo the title of all resources
$this->comment("\nResources:");
$resources = [];
foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) {
Expand All @@ -116,7 +117,10 @@ function (Event $event) {
// <meta name="robots" content="noindex, nofollow" />
$noindex = false;
if ($resource->getCrawler()->filterXpath('//meta[@name="robots"]')->count() > 0) {
$noindex = (strpos($resource->getCrawler()->filterXpath('//meta[@name="robots"]')->attr('content'), 'noindex') !== false);
$noindex = (str_contains(
$resource->getCrawler()->filterXpath('//meta[@name="robots"]')->attr('content'),
'noindex'
));

$this->info(sprintf(" - Skipping %s (on-page no-index)", $url));
}
Expand Down Expand Up @@ -160,10 +164,11 @@ function (Event $event) {
/**
* Write the sitemap as a file.
*
* @param array $resources
* @param array $resources
*
* @return void
**/
protected function writeSitemap($resources)
protected function writeSitemap(array $resources): void
{
// Prepare XML
$urlset = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.sitemaps.org/schemas/sitemap/0.9 https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"></urlset>');
Expand All @@ -190,7 +195,7 @@ protected function writeSitemap($resources)
$dom->loadXML($urlset->asXML());
$dom->formatOutput = true;

// Write file
// Write a file
try {
file_put_contents(public_path() . '/sitemap.xml', $dom->saveXML());
} catch (Exception $exception) {
Expand Down
27 changes: 16 additions & 11 deletions src/Handlers/StatsHandler.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,48 @@
class StatsHandler implements EventSubscriberInterface
{
/** @var string */
protected $spiderId;
protected string $spiderId;

protected $persisted = array();
protected array $persisted = [];

protected $queued = array();
protected array $queued = [];

protected $filtered = array();
protected array $filtered = [];

protected $failed = array();
protected array $failed = [];

public static function getSubscribedEvents(): array
{
return array(
return [
SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH => 'addToFiltered',
SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH => 'addToFiltered',
SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE => 'addToQueued',
SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED => 'addToPersisted',
SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST => 'addToFailed'
);
];
}

public function addToQueued(GenericEvent $event)
/**
 * Return the value currently held in the spiderId property.
 *
 * @return string
 */
private function getSpiderId(): string
{
    $id = $this->spiderId;

    return $id;
}

/**
 * Record the event's 'uri' argument at the end of the queued list.
 *
 * @param GenericEvent $event event carrying a 'uri' argument
 *
 * @return void
 */
public function addToQueued(GenericEvent $event): void
{
    $uri = $event->getArgument('uri');
    $this->queued[] = $uri;
}

public function addToPersisted(GenericEvent $event)
/**
 * Record the event's 'uri' argument at the end of the persisted list.
 *
 * @param GenericEvent $event event carrying a 'uri' argument
 *
 * @return void
 */
public function addToPersisted(GenericEvent $event): void
{
    $uri = $event->getArgument('uri');
    $this->persisted[] = $uri;
}

public function addToFiltered(GenericEvent $event)
/**
 * Record the event's 'uri' argument at the end of the filtered list.
 *
 * @param GenericEvent $event event carrying a 'uri' argument
 *
 * @return void
 */
public function addToFiltered(GenericEvent $event): void
{
    $uri = $event->getArgument('uri');
    $this->filtered[] = $uri;
}

public function addToFailed(GenericEvent $event)
/**
 * Store the event's 'message' argument in the failed map, keyed by the
 * string form (toString()) of the event's 'uri' argument.
 *
 * A later event for the same URI string overwrites the earlier entry.
 *
 * @param GenericEvent $event event carrying 'uri' and 'message' arguments
 *
 * @return void
 */
public function addToFailed(GenericEvent $event): void
{
    // Resolve the key before the value, matching the original evaluation order.
    $uriKey = $event->getArgument('uri')->toString();
    $reason = $event->getArgument('message');

    $this->failed[$uriKey] = $reason;
}
Expand Down