From cca4a97d9a45ccbb49c88e81877f4ef8b7c212bb Mon Sep 17 00:00:00 2001 From: Yanick Witschi Date: Thu, 9 Nov 2023 17:28:45 +0100 Subject: [PATCH] Version 1.6.0 (#34) * Updated to supported Symfony and Doctrine versions only * Implemented maxDurationInSeconds() * Updated Coding Style implementation * Updated to PHP 8 features using Rector <3 * Fixed missing docs * Update CI * Update CI to include PHP 8.3 --- .github/workflows/ci.yaml | 17 +- .php-cs-fixer.dist.php | 48 ------ README.md | 8 +- composer.json | 21 ++- phpinsights.php | 63 -------- src/BaseUriCollection.php | 8 +- src/CrawlUri.php | 60 +++---- src/Escargot.php | 191 ++++++++++++----------- src/Queue/DoctrineQueue.php | 105 ++++++------- src/Queue/InMemoryQueue.php | 13 +- src/Queue/LazyQueue.php | 31 ++-- src/Queue/QueueInterface.php | 6 +- src/Subscriber/HtmlCrawlerSubscriber.php | 18 ++- src/Subscriber/RobotsSubscriber.php | 59 ++++--- src/Subscriber/SubscriberInterface.php | 2 + src/Subscriber/Util.php | 2 +- src/SubscriberLogger.php | 23 +-- tests/EscargotTest.php | 112 ++++++++++--- tests/Fixtures/scenario6/_logs.txt | 1 + tests/Queue/DoctrineQueueTest.php | 19 ++- tests/Queue/LazyQueueTest.php | 19 ++- tests/Scenario/MockResponseFactory.php | 2 +- tests/Scenario/Scenario.php | 44 ++---- tests/Subscriber/UtilTest.php | 2 +- 24 files changed, 394 insertions(+), 480 deletions(-) delete mode 100644 .php-cs-fixer.dist.php delete mode 100644 phpinsights.php diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c12657a..222b41a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,15 +18,18 @@ jobs: - name: Setup PHP uses: shivammathur/setup-php@v2 with: - php-version: 7.4 + php-version: 8.2 coverage: none tools: php-cs-fixer - - name: Checkout - uses: actions/checkout@v2 - + - name: Checkout + uses: actions/checkout@v2 + + - name: Install the dependencies + run: composer install --no-interaction --no-suggest + - name: Run the CS fixer - run: php-cs-fixer fix + run: composer cs tests: name: PHP ${{ matrix.php }} @@ -34,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - php: [7.4, 8.0, 8.1, 8.2] + php: [8.1, 8.2, 8.3] steps: - name: Setup PHP uses: shivammathur/setup-php@v2 @@ -58,7 +61,7 @@ jobs: strategy: fail-fast: false matrix: - php: [7.4, 8.0, 8.1, 8.2] + php: [8.1, 8.2, 8.3] steps: - name: Setup PHP uses: shivammathur/setup-php@v2 diff --git a/.php-cs-fixer.dist.php b/.php-cs-fixer.dist.php deleted file mode 100644 index ef9f081..0000000 --- a/.php-cs-fixer.dist.php +++ /dev/null @@ -1,48 +0,0 @@ - -@license MIT -EOF; - -$finder = PhpCsFixer\Finder::create() - ->in([__DIR__.'/src', __DIR__.'/tests']) -; - -$config = new PhpCsFixer\Config(); -$config - ->setRiskyAllowed(true) - ->setRules([ - '@Symfony' => true, - '@Symfony:risky' => true, - 'array_syntax' => ['syntax' => 'short'], - 'combine_consecutive_unsets' => true, - 'declare_strict_types' => true, - 'general_phpdoc_annotation_remove' => true, - 'header_comment' => ['header' => $header], - 'heredoc_to_nowdoc' => true, - 'no_extra_blank_lines' => true, - 'no_unreachable_default_argument_value' => true, - 'no_useless_else' => true, - 'no_useless_return' => true, - 'no_superfluous_phpdoc_tags' => true, - 'ordered_class_elements' => true, - 'ordered_imports' => true, - 'php_unit_strict' => true, - 'phpdoc_add_missing_param_annotation' => true, - 'phpdoc_order' => true, - 'psr_autoloading' => true, - 'strict_comparison' => true, - 'strict_param' => true, - 'native_function_invocation' => ['include' => ['@compiler_optimized']], - 'void_return' => true, - ]) - ->setFinder($finder) -; - -return $config; diff --git a/README.md b/README.md index 89e6598..3b6abc2 100644 --- a/README.md +++ b/README.md @@ -211,7 +211,7 @@ There are `2` other interfaces which you might want to integrate but you don't h #### Tags Sometimes you may want to add meta information to any `CrawlUri` instance so you can let other subscribers decide -what they want to do with this information or it may be relevant during another request. +what they want to do with this information, or it may be relevant during another request. The `RobotsSubscriber` for instance, tags `CrawlUri` instances when they contained a `` in the body or the corresponding `X-Robots-Tag` header was set. All the links found on this URI are then not followed which happens during the next `shouldRequest()` call. @@ -438,6 +438,12 @@ There are different configurations you can apply to the `Escargot` instance: Returns a clone of the `Escargot` instance with a maximum total requests that are going to be executed. It can be useful if you have limited resources and only want to execute e.g. `100` requests in this run and continue later on. + +* `Escargot::withMaxDurationInSeconds(int $maxDurationInSeconds): Escargot` + + Returns a clone of the `Escargot` instance with a maximum total seconds Escargot is going to be running. It can be + useful if you have limited resources and only want to execute the crawl process for e.g. `30` seconds in this run + and continue later on. * `Escargot::withUserAgent(string $userAgent): Escargot` diff --git a/composer.json b/composer.json index 6e44b77..8b564f8 100644 --- a/composer.json +++ b/composer.json @@ -16,27 +16,32 @@ "source": "https://github.com/terminal42/escargot" }, "require": { - "php": "^7.4 || ^8.0", + "php": "^8.1", "ext-simplexml": "*", "nyholm/psr7": "^1.1", "psr/http-message": "^1.0 || ^2.0", "psr/log": "^1.1 || ^2.0 || ^3.0", - "symfony/dom-crawler": "^4.4 || ^5.0 || ^6.0", - "symfony/event-dispatcher": "^4.4 || ^5.0 || ^6.0", - "symfony/http-client": "^4.4 || ^5.0 || ^6.0", + "symfony/clock": "^6.2", + "symfony/dom-crawler": "^5.4 || ^6.0", + "symfony/event-dispatcher": "^5.4 || ^6.0", + "symfony/http-client": "^5.4 || ^6.0", + "terminal42/contao-build-tools": "@dev", "webignition/robots-txt-file": "^3.0" }, "require-dev": { - "doctrine/dbal": "^2.13 || ^3.0", - "symfony/finder": "^4.4 || ^5.0 || ^6.0", - "symfony/phpunit-bridge": "^5.1.8 || ^6.0", + "doctrine/dbal": "^3.6", + "symfony/finder": "^5.4|| ^6.0", + "symfony/phpunit-bridge": "^5.4 || ^6.0", "fig/log-test": "^1.0" }, "config": { "preferred-install": { "*": "dist" }, - "sort-packages": true + "sort-packages": true, + "allow-plugins": { + "terminal42/contao-build-tools": true + } }, "autoload": { "psr-4": { diff --git a/phpinsights.php b/phpinsights.php deleted file mode 100644 index 372a197..0000000 --- a/phpinsights.php +++ /dev/null @@ -1,63 +0,0 @@ - 'symfony', - - /* - |-------------------------------------------------------------------------- - | Configuration - |-------------------------------------------------------------------------- - | - | Here you may adjust all the various `Insights` that will be used by PHP - | Insights. You can either add, remove or configure `Insights`. Keep in - | mind, that all added `Insights` must belong to a specific `Metric`. - | - */ - - 'exclude' => [ - // 'path/to/directory-or-file' - ], - - 'add' => [ - // ExampleMetric::class => [ - // ExampleInsight::class, - // ] - ], - - 'remove' => [ - DisallowYodaComparisonSniff::class, - SuperfluousAbstractClassNamingSniff::class, - SuperfluousExceptionNamingSniff::class, - SuperfluousInterfaceNamingSniff::class, - SpaceAfterNotSniff::class - ], - - 'config' => [ - // ExampleInsight::class => [ - // 'key' => 'value', - // ], - ], - -]; diff --git a/src/BaseUriCollection.php b/src/BaseUriCollection.php index 2223d4e..5b323a3 100644 --- a/src/BaseUriCollection.php +++ b/src/BaseUriCollection.php @@ -19,7 +19,7 @@ final class BaseUriCollection implements \IteratorAggregate, \Countable /** * @var array */ - private $baseUris = []; + private array $baseUris = []; /** * @param array $baseUris @@ -80,17 +80,11 @@ public function all(): array return array_values($this->baseUris); } - /** - * {@inheritdoc} - */ public function getIterator(): \Traversable { return new \ArrayIterator($this->all()); } - /** - * {@inheritdoc} - */ public function count(): int { return \count($this->all()); diff --git a/src/CrawlUri.php b/src/CrawlUri.php index 97594f7..a75af57 100644 --- a/src/CrawlUri.php +++ b/src/CrawlUri.php @@ -14,43 +14,23 @@ use Psr\Http\Message\UriInterface; -final class CrawlUri +final class CrawlUri implements \Stringable { - /** - * @var UriInterface - */ - private $uri; - - /** - * @var int - */ - private $level; - - /** - * @var bool - */ - private $processed = false; - - /** - * @var bool - */ - private $wasMarkedProcessed = false; - - /** - * @var UriInterface|null - */ - private $foundOn = null; - - /** - * @var array - */ - private $tags = []; - - public function __construct(UriInterface $uri, int $level, bool $processed = false, ?UriInterface $foundOn = null) - { + private readonly UriInterface $uri; + + private bool $wasMarkedProcessed = false; + + private UriInterface|null $foundOn = null; + + private array $tags = []; + + public function __construct( + UriInterface $uri, + private readonly int $level, + private bool $processed = false, + UriInterface|null $foundOn = null, + ) { $this->uri = self::normalizeUri($uri); - $this->level = $level; - $this->processed = $processed; if (null !== $foundOn) { $this->foundOn = self::normalizeUri($foundOn); @@ -64,7 +44,7 @@ public function __toString(): string $this->getLevel(), $this->isProcessed() ? 'yes' : 'no', (string) ($this->getFoundOn() ?: 'root'), - $this->getTags() ? implode(', ', $this->getTags()) : 'none' + $this->getTags() ? implode(', ', $this->getTags()) : 'none', ); } @@ -96,7 +76,7 @@ public function wasMarkedProcessed(): bool return $this->wasMarkedProcessed; } - public function getFoundOn(): ?UriInterface + public function getFoundOn(): UriInterface|null { return $this->foundOn; } @@ -108,7 +88,7 @@ public function getTags(): array public function addTag(string $tag): self { - if (false !== strpos($tag, ',')) { + if (str_contains($tag, ',')) { throw new \InvalidArgumentException('Cannot use commas in tags.'); } @@ -139,8 +119,6 @@ public static function normalizeUri(UriInterface $uri): UriInterface $uri = $uri->withPath('/'); } - $uri = $uri->withFragment(''); - - return $uri; + return $uri->withFragment(''); } } diff --git a/src/Escargot.php b/src/Escargot.php index 1e7fa6a..439e1d1 100644 --- a/src/Escargot.php +++ b/src/Escargot.php @@ -12,11 +12,12 @@ namespace Terminal42\Escargot; -use Nyholm\Psr7\Uri; use Psr\Http\Message\UriInterface; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerInterface; use Psr\Log\LogLevel; +use Symfony\Component\Clock\ClockInterface; +use Symfony\Component\Clock\NativeClock; use Symfony\Component\HttpClient\HttpClient; use Symfony\Contracts\HttpClient\ChunkInterface; use Symfony\Contracts\HttpClient\Exception\ExceptionInterface; @@ -36,101 +37,73 @@ final class Escargot { private const DEFAULT_USER_AGENT = 'terminal42/escargot'; - /** - * @var QueueInterface - */ - private $queue; - - /** - * @var string - */ - private $jobId; - - /** - * @var BaseUriCollection - */ - private $baseUris; + private ClockInterface $clock; - /** - * @var HttpClientInterface|null - */ - private $client; + private HttpClientInterface|null $client = null; - /** - * @var LoggerInterface|null - */ - private $logger; + private LoggerInterface|null $logger = null; /** - * @var SubscriberInterface[] + * @var array */ - private $subscribers = []; + private array $subscribers = []; - /** - * @var string - */ - private $userAgent; + private string $userAgent; /** * Maximum number of requests * Escargot is going to * execute. * 0 means no limit. + */ + private int $maxRequests = 0; + + /** + * Maximum number of duration in seconds + * Escargot is going to work on requests. * - * @var int + * 0 means no limit. */ - private $maxRequests = 0; + private int $maxDurationInSeconds = 0; /** * Request delay in microseconds. * 0 means no delay. - * - * @var int */ - private $requestDelay = 0; + private int $requestDelay = 0; /** * Maximum concurrent requests * that are being sent. - * - * @var int */ - private $concurrency = 10; + private int $concurrency = 10; /** * Maximum depth Escargot * is going to crawl. * 0 means no limit. - * - * @var int */ - private $maxDepth = 0; + private int $maxDepth = 0; - /** - * @var int - */ - private $requestsSent = 0; + private int $requestsSent = 0; - /** - * @var array - */ - private $runningRequests = []; + private array $runningRequests = []; /** * Keeps track of all the decisions * for all the subscribers for * every CrawlUri instance. - * - * @var array */ - private $decisionMap = ['shouldRequest' => [], 'needsContent' => []]; + private array $decisionMap = ['shouldRequest' => [], 'needsContent' => []]; - private function __construct(QueueInterface $queue, string $jobId, BaseUriCollection $baseUris) - { - $this->queue = $queue; - $this->jobId = $jobId; - $this->baseUris = $baseUris; + private \DateTimeImmutable $startTime; + private function __construct( + private readonly QueueInterface $queue, + private readonly string $jobId, + private readonly BaseUriCollection $baseUris, + ) { + $this->clock = new NativeClock(); $this->userAgent = self::DEFAULT_USER_AGENT; } @@ -157,7 +130,7 @@ public function getUserAgent(): string } /** - * @return SubscriberInterface[] + * @return array */ public function getSubscribers(): array { @@ -187,6 +160,22 @@ public function withMaxRequests(int $maxRequests): self return $new; } + public function withMaxDurationInSeconds(int $maxDurationInSeconds): self + { + $new = clone $this; + $new->maxDurationInSeconds = $maxDurationInSeconds; + + return $new; + } + + public function withClock(ClockInterface $clock): self + { + $new = clone $this; + $new->clock = $clock; + + return $new; + } + public function withConcurrency(int $concurrency): self { $new = clone $this; @@ -246,7 +235,7 @@ public function addSubscriber(SubscriberInterface $subscriber): self return $this; } - public function getLogger(): ?LoggerInterface + public function getLogger(): LoggerInterface|null { return $this->logger; } @@ -299,7 +288,7 @@ public static function createFromJobId(string $jobId, QueueInterface $queue): se return new self( $queue, $jobId, - $queue->getBaseUris($jobId) + $queue->getBaseUris($jobId), ); } @@ -314,12 +303,14 @@ public static function create(BaseUriCollection $baseUris, QueueInterface $queue return new self( $queue, $jobId, - $baseUris + $baseUris, ); } public function crawl(): void { + $this->startTime = $this->clock->now(); + while (true) { $responses = $this->prepareResponses(); @@ -332,7 +323,7 @@ public function crawl(): void $this->log( LogLevel::DEBUG, - sprintf('Finished crawling! Sent %d request(s).', $this->getRequestsSent()) + sprintf('Finished crawling! Sent %d request(s).', $this->getRequestsSent()), ); foreach ($this->subscribers as $subscriber) { @@ -351,6 +342,7 @@ public function crawl(): void * You can use Escargot::isMaxDepthReached() for that. * * @return CrawlUri the new CrawlUri instance + * * @throw \BadMethodCallException If max depth would be reached. */ public function addUriToQueue(UriInterface $uri, CrawlUri $foundOn, bool $processed = false): CrawlUri @@ -377,7 +369,7 @@ public function isMaxDepthReached(CrawlUri $foundOn): bool return $foundOn->getLevel() >= $this->getMaxDepth(); } - public function getCrawlUri(UriInterface $uri): ?CrawlUri + public function getCrawlUri(UriInterface $uri): CrawlUri|null { return $this->queue->get($this->jobId, $uri); } @@ -402,7 +394,7 @@ private function setLoggerToSubscriber(SubscriberInterface $subscriber): void { if (null !== $this->logger && $subscriber instanceof LoggerAwareInterface) { // Decorate logger to automatically pass the subscriber in the logging context - $logger = new SubscriberLogger($this->logger, \get_class($subscriber)); + $logger = new SubscriberLogger($this->logger, $subscriber::class); $subscriber->setLogger($logger); } } @@ -410,13 +402,13 @@ private function setLoggerToSubscriber(SubscriberInterface $subscriber): void /** * Logs a message to the logger if one was provided. */ - private function log(string $level, string $message, CrawlUri $crawlUri = null): void + private function log(string $level, string $message, CrawlUri|null $crawlUri = null): void { if (null === $this->logger) { return; } - $context = ['source' => static::class]; + $context = ['source' => self::class]; if (null !== $crawlUri) { $context['crawlUri'] = $crawlUri; @@ -467,13 +459,14 @@ private function processResponseChunk(ResponseInterface $response, ChunkInterfac if ($chunk->isFirst()) { // If the response was a redirect of an URI we have already crawled, we can early abort // this response as it has already been processed. - if ($response->getInfo('redirect_count') > 0 + if ( + $response->getInfo('redirect_count') > 0 && null !== $this->queue->get($this->getJobId(), CrawlUri::normalizeUri(HttpUriFactory::create((string) $response->getInfo('url')))) ) { $this->log( LogLevel::DEBUG, 'Skipped further response processing because crawler got redirected to an URI that\'s already been crawled.', - $crawlUri + $crawlUri, ); $response->cancel(); $this->finishRequest($response); @@ -487,6 +480,7 @@ private function processResponseChunk(ResponseInterface $response, ChunkInterfac $response->getHeaders(); $needsContent = false; + foreach ($this->subscribers as $subscriber) { $shouldRequestDecision = $this->getDecisionForSubscriber('shouldRequest', $crawlUri, $subscriber); if (SubscriberInterface::DECISION_NEGATIVE === $shouldRequestDecision) { @@ -527,10 +521,23 @@ private function processResponseChunk(ResponseInterface $response, ChunkInterfac */ private function prepareResponses(): array { + $response = null; $responses = []; + $hasMaxRequestsReached = $this->isMaxRequestsReached(); + $hasMaxDurationReached = $this->isMaxDurationInSecondsReached(); + + if ($hasMaxRequestsReached) { + $this->log(LogLevel::DEBUG, 'Configured max requests reached!'); + } + + if ($hasMaxDurationReached) { + $this->log(LogLevel::DEBUG, 'Configured max duration reached!'); + } + while (!$this->isMaxConcurrencyReached() - && !$this->isMaxRequestsReached() + && !$hasMaxRequestsReached + && !$hasMaxDurationReached && ($crawlUri = $this->queue->getNext($this->jobId)) ) { // Already processed, ignore @@ -547,7 +554,7 @@ private function prepareResponses(): array $this->log( LogLevel::DEBUG, 'Skipped because it\'s not a valid http(s) URI.', - $crawlUri + $crawlUri, ); continue; } @@ -570,13 +577,17 @@ private function prepareResponses(): array // Request delay if (0 !== $this->requestDelay) { - usleep($this->requestDelay); + $this->clock->sleep($this->requestDelay / 1_000_000); } try { - $response = $this->getClient()->request('GET', (string) $crawlUri->getUri(), [ - 'user_data' => $crawlUri, - ]); + $response = $this->getClient()->request( + 'GET', + (string) $crawlUri->getUri(), + [ + 'user_data' => $crawlUri, + ], + ); $responses[] = $response; // Mark the response as started @@ -591,12 +602,12 @@ private function prepareResponses(): array private function storeDecisionForSubscriber(string $key, CrawlUri $crawlUri, SubscriberInterface $subscriber, string $decision): void { - $this->decisionMap[$key][(string) $crawlUri->getUri().\get_class($subscriber)] = $decision; + $this->decisionMap[$key][(string) $crawlUri->getUri().$subscriber::class] = $decision; } private function getDecisionForSubscriber(string $key, CrawlUri $crawlUri, SubscriberInterface $subscriber): string { - return $this->decisionMap[$key][(string) $crawlUri->getUri().\get_class($subscriber)] ?? SubscriberInterface::DECISION_ABSTAIN; + return $this->decisionMap[$key][(string) $crawlUri->getUri().$subscriber::class] ?? SubscriberInterface::DECISION_ABSTAIN; } private function isMaxRequestsReached(): bool @@ -604,18 +615,27 @@ private function isMaxRequestsReached(): bool return 0 !== $this->maxRequests && $this->requestsSent >= $this->maxRequests; } + private function isMaxDurationInSecondsReached(): bool + { + if (0 === $this->maxDurationInSeconds) { + return false; + } + + return $this->clock->now() >= $this->startTime->add(new \DateInterval('PT'.$this->maxDurationInSeconds.'S')); + } + private function isMaxConcurrencyReached(): bool { return \count($this->runningRequests) >= $this->concurrency; } - private function handleException(ExceptionInterface $exception, CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk = null): void + private function handleException(ExceptionInterface $exception, CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface|null $chunk = null): void { // Log the exception $this->log( LogLevel::DEBUG, - sprintf('Exception of type "%s" occurred: %s', \get_class($exception), $exception->getMessage()), - $crawlUri + sprintf('Exception of type "%s" occurred: %s', $exception::class, $exception->getMessage()), + $crawlUri, ); // Mark the responses as finished @@ -633,16 +653,11 @@ private function handleException(ExceptionInterface $exception, CrawlUri $crawlU continue; } - switch (true) { - case $exception instanceof TransportExceptionInterface: - $subscriber->onTransportException($crawlUri, $exception, $response); - break; - case $exception instanceof HttpExceptionInterface: - $subscriber->onHttpException($crawlUri, $exception, $response, $chunk); - break; - default: - throw new \RuntimeException('Unknown exception type!'); - } + match (true) { + $exception instanceof TransportExceptionInterface => $subscriber->onTransportException($crawlUri, $exception, $response), + $exception instanceof HttpExceptionInterface => $subscriber->onHttpException($crawlUri, $exception, $response, $chunk), + default => throw new \RuntimeException('Unknown exception type!'), + }; } } diff --git a/src/Queue/DoctrineQueue.php b/src/Queue/DoctrineQueue.php index 0cabcdb..a685015 100644 --- a/src/Queue/DoctrineQueue.php +++ b/src/Queue/DoctrineQueue.php @@ -13,7 +13,6 @@ namespace Terminal42\Escargot\Queue; use Doctrine\DBAL\Connection; -use Doctrine\DBAL\Schema\Comparator; use Doctrine\DBAL\Schema\Table; use Doctrine\DBAL\Types\Types; use Psr\Http\Message\UriInterface; @@ -23,31 +22,16 @@ final class DoctrineQueue implements QueueInterface { - /** - * @var Connection - */ - private $connection; - - /** - * @var \Closure - */ - private $jobIdGenerator; - - /** - * @var string - */ - private $tableName; - - public function __construct(Connection $connection, \Closure $jobIdGenerator, ?string $tableName = null) - { - $this->connection = $connection; - $this->jobIdGenerator = $jobIdGenerator; - $this->tableName = $tableName ?? 'escargot'; + public function __construct( + private readonly Connection $connection, + private readonly \Closure $jobIdGenerator, + private readonly string $tableName = 'escargot', + ) { } public function createJobId(BaseUriCollection $baseUris): string { - $jobId = $this->jobIdGenerator->__invoke(); + $jobId = ($this->jobIdGenerator)(); foreach ($baseUris as $baseUri) { $this->add($jobId, new CrawlUri($baseUri, 0)); @@ -63,7 +47,9 @@ public function isJobIdValid(string $jobId): bool ->from($this->tableName) ->where('job_id = :jobId') ->setParameter('jobId', $jobId, Types::STRING) - ->setMaxResults(1); + ->setMaxResults(1) + ; + $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); return (bool) $stmt->fetchOne(); @@ -74,7 +60,8 @@ public function deleteJobId(string $jobId): void $queryBuilder = $this->connection->createQueryBuilder() ->delete($this->tableName) ->where('job_id = :jobId') - ->setParameter('jobId', $jobId, Types::STRING); + ->setParameter('jobId', $jobId, Types::STRING) + ; method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); } @@ -89,7 +76,8 @@ public function getBaseUris(string $jobId): BaseUriCollection ->where('job_id = :jobId') ->andWhere('level = :level') ->setParameter('jobId', $jobId, Types::STRING) - ->setParameter('level', 0, Types::INTEGER); + ->setParameter('level', 0, Types::INTEGER) + ; $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); @@ -102,7 +90,7 @@ public function getBaseUris(string $jobId): BaseUriCollection return $baseUris; } - public function get(string $jobId, UriInterface $uri): ?CrawlUri + public function get(string $jobId, UriInterface $uri): CrawlUri|null { $queryBuilder = $this->connection->createQueryBuilder() ->select('uri, level, processed, found_on, tags') @@ -111,7 +99,8 @@ public function get(string $jobId, UriInterface $uri): ?CrawlUri ->andWhere('uri_hash = :uri_hash') ->setParameter('jobId', $jobId, Types::STRING) ->setParameter('uri_hash', $this->getUriHash($uri), Types::STRING) - ->setMaxResults(1); + ->setMaxResults(1) + ; $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); @@ -142,26 +131,29 @@ public function add(string $jobId, CrawlUri $crawlUri): void ]) ->setParameter('uri', (string) $crawlUri->getUri(), Types::STRING) ->setParameter('level', $crawlUri->getLevel(), Types::INTEGER) - ->setParameter('foundOn', $crawlUri->getFoundOn(), Types::STRING); + ->setParameter('foundOn', $crawlUri->getFoundOn(), Types::STRING) + ; } else { $queryBuilder ->update($this->tableName) ->set('processed', ':processed') ->set('tags', ':tags') ->where('job_id = :jobId') - ->andWhere('uri_hash = :uri_hash'); + ->andWhere('uri_hash = :uri_hash') + ; } $queryBuilder ->setParameter('jobId', $jobId, Types::STRING) ->setParameter('uri_hash', $this->getUriHash($crawlUri->getUri()), Types::STRING) ->setParameter('processed', $crawlUri->isProcessed(), Types::BOOLEAN) - ->setParameter('tags', implode(',', $crawlUri->getTags()), Types::TEXT); + ->setParameter('tags', implode(',', $crawlUri->getTags()), Types::TEXT) + ; method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); } - public function getNext(string $jobId, int $skip = 0): ?CrawlUri + public function getNext(string $jobId, int $skip = 0): CrawlUri|null { $queryBuilder = $this->connection->createQueryBuilder() ->select('uri, level, processed, found_on, tags') @@ -171,7 +163,8 @@ public function getNext(string $jobId, int $skip = 0): ?CrawlUri ->orderBy('id', 'ASC') ->setParameter('jobId', $jobId, Types::STRING) ->setParameter('processed', false, Types::BOOLEAN) - ->setMaxResults(1); + ->setMaxResults(1) + ; if ($skip > 0) { $queryBuilder->setFirstResult($skip); @@ -194,7 +187,8 @@ public function countAll(string $jobId): int ->from($this->tableName) ->where('job_id = :jobId') ->setParameter('jobId', $jobId, Types::STRING) - ->setMaxResults(1); + ->setMaxResults(1) + ; $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); @@ -210,7 +204,8 @@ public function countPending(string $jobId): int ->andWhere('processed = :processed') ->setParameter('jobId', $jobId, Types::STRING) ->setParameter('processed', false, Types::BOOLEAN) - ->setMaxResults(1); + ->setMaxResults(1) + ; $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); @@ -224,7 +219,8 @@ public function getAll(string $jobId): \Generator ->from($this->tableName) ->where('job_id = :jobId') ->orderBy('id', 'ASC') - ->setParameter('jobId', $jobId, Types::STRING); + ->setParameter('jobId', $jobId, Types::STRING) + ; $stmt = method_exists($queryBuilder, 'executeQuery') ? $queryBuilder->executeQuery() : $queryBuilder->execute(); @@ -239,19 +235,14 @@ public function createSchema(): void { $table = $this->getTableSchema(); - $schemaManager = method_exists($this->connection, 'createSchemaManager') ? - $this->connection->createSchemaManager() : - $this->connection->getSchemaManager() - ; + $schemaManager = $this->connection->createSchemaManager(); - if (!$schemaManager->tablesExist($this->tableName)) { + if (!$schemaManager->tablesExist([$this->tableName])) { $queries = $this->connection->getDatabasePlatform()->getCreateTableSQL($table); } else { - $comparator = method_exists($schemaManager, 'createComparator') ? - $schemaManager->createComparator() : - new Comparator() - ; - $tableDiff = $comparator->diffTable($schemaManager->listTableDetails($this->tableName), $table); + $comparator = $schemaManager->createComparator(); + + $tableDiff = $comparator->compareTables($schemaManager->introspectTable($this->tableName), $table); $queries = $this->connection->getDatabasePlatform()->getAlterTableSQL($tableDiff); } @@ -266,30 +257,38 @@ public function getTableSchema(): Table $table->addColumn('id', Types::BIGINT) ->setAutoincrement(true) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('job_id', Types::GUID) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('uri_hash', Types::STRING) ->setLength(40) ->setFixed(true) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('uri', Types::TEXT) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('found_on', Types::TEXT) - ->setNotnull(false); + ->setNotnull(false) + ; $table->addColumn('level', Types::INTEGER) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('processed', Types::BOOLEAN) - ->setNotnull(true); + ->setNotnull(true) + ; $table->addColumn('tags', Types::TEXT) - ->setNotnull(false); + ->setNotnull(false) + ; $table->setPrimaryKey(['id']); $table->addIndex(['job_id']); diff --git a/src/Queue/InMemoryQueue.php b/src/Queue/InMemoryQueue.php index 0087ca8..203f3b8 100644 --- a/src/Queue/InMemoryQueue.php +++ b/src/Queue/InMemoryQueue.php @@ -19,14 +19,14 @@ final class InMemoryQueue implements QueueInterface { /** - * @var array> + * @var array> */ - private $baseUris = []; + private array $baseUris = []; /** - * @var array> + * @var array> */ - private $queue = []; + private array $queue = []; public function createJobId(BaseUriCollection $baseUris): string { @@ -57,7 +57,7 @@ public function getBaseUris(string $jobId): BaseUriCollection return $this->baseUris[$jobId]; } - public function get(string $jobId, UriInterface $uri): ?CrawlUri + public function get(string $jobId, UriInterface $uri): CrawlUri|null { return $this->queue[$jobId][(string) $uri] ?? null; } @@ -71,13 +71,14 @@ public function add(string $jobId, CrawlUri $crawlUri): void $this->queue[$jobId][(string) $crawlUri->getUri()] = $crawlUri; } - public function getNext(string $jobId, int $skip = 0): ?CrawlUri + public function getNext(string $jobId, int $skip = 0): CrawlUri|null { if (!isset($this->queue[$jobId])) { return null; } $i = 0; + foreach ($this->queue[$jobId] as $crawlUri) { if ($crawlUri->isProcessed()) { continue; diff --git a/src/Queue/LazyQueue.php b/src/Queue/LazyQueue.php index 4bf1003..ef94d2c 100644 --- a/src/Queue/LazyQueue.php +++ b/src/Queue/LazyQueue.php @@ -19,29 +19,16 @@ final class LazyQueue implements QueueInterface { /** - * @var QueueInterface + * @var array */ - private $primaryQueue; + private array $jobIdMapper = []; - /** - * @var QueueInterface - */ - private $secondaryQueue; - - /** - * @var array - */ - private $jobIdMapper = []; + private int $toSkip = 0; - /** - * @var int - */ - private $toSkip = 0; - - public function __construct(QueueInterface $primaryQueue, QueueInterface $secondaryQueue) - { - $this->primaryQueue = $primaryQueue; - $this->secondaryQueue = $secondaryQueue; + public function __construct( + private readonly QueueInterface $primaryQueue, + private readonly QueueInterface $secondaryQueue, + ) { } public function createJobId(BaseUriCollection $baseUris): string @@ -69,7 +56,7 @@ public function getBaseUris(string $jobId): BaseUriCollection return $this->primaryQueue->getBaseUris($this->getJobIdFromSecondaryJobId($jobId)); } - public function get(string $jobId, UriInterface $uri): ?CrawlUri + public function get(string $jobId, UriInterface $uri): CrawlUri|null { // If we have it in the primary queue, early return $crawlUri = $this->primaryQueue->get($this->getJobIdFromSecondaryJobId($jobId), $uri); @@ -100,7 +87,7 @@ public function add(string $jobId, CrawlUri $crawlUri): void } } - public function getNext(string $jobId, int $skip = 0): ?CrawlUri + public function getNext(string $jobId, int $skip = 0): CrawlUri|null { // If we have it in the primary queue, early return $next = $this->primaryQueue->getNext($this->getJobIdFromSecondaryJobId($jobId), $skip); diff --git a/src/Queue/QueueInterface.php b/src/Queue/QueueInterface.php index 9d1e61c..dd6b2c8 100644 --- a/src/Queue/QueueInterface.php +++ b/src/Queue/QueueInterface.php @@ -49,7 +49,7 @@ public function getBaseUris(string $jobId): BaseUriCollection; * Returns a CrawlUri for a given UriInterface if already * added to the queue. */ - public function get(string $jobId, UriInterface $baseUri): ?CrawlUri; + public function get(string $jobId, UriInterface $baseUri): CrawlUri|null; /** * Adds a new CrawlUri instance to the queue. @@ -76,7 +76,7 @@ public function add(string $jobId, CrawlUri $crawlUri): void; * useful for the LazyQueue implementation so it can skip n entries * it's already processed and stored in the primary queue. */ - public function getNext(string $jobId, int $skip = 0): ?CrawlUri; + public function getNext(string $jobId, int $skip = 0): CrawlUri|null; /** * Returns the total of all URIs. @@ -91,7 +91,7 @@ public function countPending(string $jobId): int; /** * Returns all CrawlUri instances in the queue. * - * @return \Generator & iterable + * @return \Generator&iterable */ public function getAll(string $jobId): \Generator; } diff --git a/src/Subscriber/HtmlCrawlerSubscriber.php b/src/Subscriber/HtmlCrawlerSubscriber.php index 220158e..f919f19 100644 --- a/src/Subscriber/HtmlCrawlerSubscriber.php +++ b/src/Subscriber/HtmlCrawlerSubscriber.php @@ -31,6 +31,7 @@ final class HtmlCrawlerSubscriber implements SubscriberInterface, EscargotAwareI use SubscriberLoggerTrait; public const TAG_REL_NOFOLLOW = 'rel-nofollow'; + public const TAG_NO_TEXT_HTML_TYPE = 'no-txt-html-type'; public function shouldRequest(CrawlUri $crawlUri): string @@ -62,6 +63,7 @@ public function onLastChunk(CrawlUri $crawlUri, ResponseInterface $response, Chu // Links $linkCrawler = $crawler->filterXPath('descendant-or-self::a'); + foreach ($linkCrawler->links() as $link) { $this->addNewUriToQueueFromNode($crawlUri, $link->getUri(), $link->getNode()); } @@ -82,14 +84,14 @@ private function addNewUriToQueueFromNode(CrawlUri $crawlUri, string $uri, \DOME try { $uri = HttpUriFactory::create($uri); - } catch (\InvalidArgumentException $e) { + } catch (\InvalidArgumentException) { $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, sprintf( 'Could not add "%s" to the queue because the link is invalid.', - $uri - ) + $uri, + ), ); return; @@ -104,8 +106,8 @@ private function addNewUriToQueueFromNode(CrawlUri $crawlUri, string $uri, \DOME LogLevel::DEBUG, sprintf( 'Did not add "%s" to the queue because it was marked as "data-escargot-ignore".', - $uri - ) + $uri, + ), ); return; @@ -117,14 +119,14 @@ private function addNewUriToQueueFromNode(CrawlUri $crawlUri, string $uri, \DOME // Add all data attributes as tags for e.g. other subscribers if ($node->hasAttributes()) { foreach ($node->attributes as $attribute) { - if (0 === strpos($attribute->name, 'data-')) { - $newCrawlUri->addTag(substr($attribute->name, 5)); + if (str_starts_with((string) $attribute->name, 'data-')) { + $newCrawlUri->addTag(substr((string) $attribute->name, 5)); } } } // Add a tag to the new CrawlUri instance if it was marked with rel="nofollow" - if ($node->hasAttribute('rel') && false !== strpos($node->getAttribute('rel'), 'nofollow')) { + if ($node->hasAttribute('rel') && str_contains($node->getAttribute('rel'), 'nofollow')) { $newCrawlUri->addTag(self::TAG_REL_NOFOLLOW); } diff --git a/src/Subscriber/RobotsSubscriber.php b/src/Subscriber/RobotsSubscriber.php index 19239a5..4a8bf81 100644 --- a/src/Subscriber/RobotsSubscriber.php +++ b/src/Subscriber/RobotsSubscriber.php @@ -12,7 +12,6 @@ namespace Terminal42\Escargot\Subscriber; -use Nyholm\Psr7\Uri; use Psr\Http\Message\UriInterface; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerAwareTrait; @@ -38,18 +37,18 @@ final class RobotsSubscriber implements SubscriberInterface, EscargotAwareInterf use SubscriberLoggerTrait; public const TAG_NOINDEX = 'noindex'; + public const TAG_NOFOLLOW = 'nofollow'; + public const TAG_DISALLOWED_ROBOTS_TXT = 'disallowed-robots-txt'; + public const TAG_IS_SITEMAP = 'is-sitemap'; /** - * @var array + * @var array */ - private $robotsTxtCache = []; + private array $robotsTxtCache = []; - /** - * {@inheritdoc} - */ public function shouldRequest(CrawlUri $crawlUri): string { // Check if it is a sitemap previously found @@ -64,9 +63,6 @@ public function shouldRequest(CrawlUri $crawlUri): string return self::DECISION_ABSTAIN; } - /** - * {@inheritdoc} - */ public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, ChunkInterface $chunk): string { // Check if it is a sitemap previously found @@ -80,7 +76,7 @@ public function needsContent(CrawlUri $crawlUri, ResponseInterface $response, Ch $this->handleNoindexNofollowTags( $crawlUri, $xRobotsTagValue, - 'Added the "%tag%" tag because the X-Robots-Tag header contained "%value%".' + 'Added the "%tag%" tag because the X-Robots-Tag header contained "%value%".', ); } @@ -109,7 +105,7 @@ public function onLastChunk(CrawlUri $crawlUri, ResponseInterface $response, Chu $this->handleNoindexNofollowTags( $crawlUri, $robotsMetaTagValue, - 'Added the "%tag%" tag because the tag contained "%value%".' + 'Added the "%tag%" tag because the tag contained "%value%".', ); } @@ -117,11 +113,11 @@ private function handleNoindexNofollowTags(CrawlUri $crawlUri, string $value, st { $tags = []; - if (false !== strpos($value, 'noindex')) { + if (str_contains($value, 'noindex')) { $tags[] = self::TAG_NOINDEX; } - if (false !== strpos($value, 'nofollow')) { + if (str_contains($value, 'nofollow')) { $tags[] = self::TAG_NOFOLLOW; } @@ -131,7 +127,7 @@ private function handleNoindexNofollowTags(CrawlUri $crawlUri, string $value, st $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, - str_replace(['%value%', '%tag%'], [$value, $tag], $messageTpl) + str_replace(['%value%', '%tag%'], [$value, $tag], $messageTpl), ); } } @@ -160,13 +156,13 @@ private function handleDisallowedByRobotsTxtTag(CrawlUri $crawlUri): void LogLevel::DEBUG, sprintf( 'Added the "%s" tag because of the robots.txt content.', - self::TAG_DISALLOWED_ROBOTS_TXT - ) + self::TAG_DISALLOWED_ROBOTS_TXT, + ), ); } } - private function getRobotsTxtFile(CrawlUri $crawlUri): ?File + private function getRobotsTxtFile(CrawlUri $crawlUri): File|null { $robotsTxtUri = $this->getRobotsTxtUri($crawlUri); @@ -180,7 +176,7 @@ private function getRobotsTxtFile(CrawlUri $crawlUri): ?File try { $robotsTxtContent = $response->getContent(); - } catch (HttpExceptionInterface $e) { + } catch (HttpExceptionInterface) { return $this->robotsTxtCache[(string) $robotsTxtUri] = null; } @@ -188,7 +184,7 @@ private function getRobotsTxtFile(CrawlUri $crawlUri): ?File $parser->setSource($robotsTxtContent); return $this->robotsTxtCache[(string) $robotsTxtUri] = $parser->getFile(); - } catch (TransportExceptionInterface $exception) { + } catch (TransportExceptionInterface) { return $this->robotsTxtCache[(string) $robotsTxtUri] = null; } } @@ -213,14 +209,14 @@ private function handleSitemap(CrawlUri $crawlUri, File $robotsTxt): void foreach ($robotsTxt->getNonGroupDirectives()->getByField('sitemap')->getDirectives() as $directive) { try { $sitemapUri = HttpUriFactory::create($directive->getValue()->get()); - } catch (\InvalidArgumentException $e) { + } catch (\InvalidArgumentException) { $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, sprintf( 'Could not add sitemap URI "%s" to the queue because the URI is invalid.', - $directive->getValue()->get() - ) + $directive->getValue()->get(), + ), ); continue; } @@ -242,31 +238,34 @@ private function extractUrisFromSitemap(CrawlUri $sitemapUri, string $content): return; } - set_error_handler(function ($errno, $errstr): void { - throw new \Exception($errstr, $errno); - }); + set_error_handler( + static function ($errno, $errstr): never { + throw new \Exception($errstr, $errno); + }, + ); + try { $urls = new \SimpleXMLElement($content); - } catch (\Exception $exception) { + } catch (\Exception) { return; } finally { restore_error_handler(); } - $sitemapIndex = ('sitemapindex' === $urls->getName()); + $sitemapIndex = 'sitemapindex' === $urls->getName(); foreach ($urls as $url) { // Add it to the queue if not present already try { $uri = HttpUriFactory::create((string) $url->loc); - } catch (\InvalidArgumentException $e) { + } catch (\InvalidArgumentException) { $this->logWithCrawlUri( $sitemapUri, LogLevel::DEBUG, sprintf( 'Could not add URI "%s" found on in the sitemap to the queue because the URI is invalid.', - (string) $url->loc - ) + (string) $url->loc, + ), ); continue; diff --git a/src/Subscriber/SubscriberInterface.php b/src/Subscriber/SubscriberInterface.php index abdfc55..6a62d51 100644 --- a/src/Subscriber/SubscriberInterface.php +++ b/src/Subscriber/SubscriberInterface.php @@ -19,7 +19,9 @@ interface SubscriberInterface { public const DECISION_POSITIVE = 'positive'; + public const DECISION_ABSTAIN = 'abstain'; + public const DECISION_NEGATIVE = 'negative'; /** diff --git a/src/Subscriber/Util.php b/src/Subscriber/Util.php index cbee4db..6b6ae2c 100644 --- a/src/Subscriber/Util.php +++ b/src/Subscriber/Util.php @@ -24,7 +24,7 @@ public static function isOfContentType(ResponseInterface $response, string $cont return false; } - return false !== strpos($response->getHeaders()['content-type'][0], $contentType); + return str_contains($response->getHeaders()['content-type'][0], $contentType); } /** diff --git a/src/SubscriberLogger.php b/src/SubscriberLogger.php index 50e9e0e..94d656f 100644 --- a/src/SubscriberLogger.php +++ b/src/SubscriberLogger.php @@ -17,25 +17,17 @@ class SubscriberLogger extends AbstractLogger { - /** - * @var LoggerInterface - */ - private $decorated; + private string|null $subscriberClass = null; - /** - * @var string - */ - private $subscriberClass; - - public function __construct(LoggerInterface $decorated, string $subscriberClass) - { + public function __construct( + private readonly LoggerInterface $decorated, + string $subscriberClass, + ) { // Anonymous class names contain null bytes so let's standardize them a little - if (false !== strpos($subscriberClass, '@anonymous')) { + if (str_contains($subscriberClass, '@anonymous')) { $subscriberClass = 'class@anonymous:'.basename($subscriberClass); $subscriberClass = preg_replace('/\.php(.+)$/', '', $subscriberClass); } - - $this->decorated = $decorated; $this->subscriberClass = $subscriberClass; } @@ -44,9 +36,6 @@ public function logWithCrawlUri(CrawlUri $crawlUri, string $level, string $messa $this->log($level, $message, ['crawlUri' => $crawlUri]); } - /** - * {@inheritdoc} - */ public function log($level, $message, array $context = []): void { $context = array_merge($context, ['source' => $this->subscriberClass]); diff --git a/tests/EscargotTest.php b/tests/EscargotTest.php index dc747b3..64c9c80 100644 --- a/tests/EscargotTest.php +++ b/tests/EscargotTest.php @@ -19,6 +19,7 @@ use Psr\Log\LoggerInterface; use Psr\Log\LogLevel; use Psr\Log\Test\TestLogger; +use Symfony\Component\Clock\MockClock; use Symfony\Component\Finder\Finder; use Symfony\Component\HttpClient\MockHttpClient; use Symfony\Contracts\HttpClient\ChunkInterface; @@ -37,6 +38,7 @@ use Terminal42\Escargot\Subscriber\TagValueResolvingSubscriberInterface; use Terminal42\Escargot\SubscriberLogger; use Terminal42\Escargot\SubscriberLoggerTrait; +use Terminal42\Escargot\Tests\Scenario\MockResponseFactory; use Terminal42\Escargot\Tests\Scenario\Scenario; class EscargotTest extends TestCase @@ -70,14 +72,17 @@ public function testWithers(): void $subscriber = $this->createMock(CompleteSubscriber::class); $subscriber ->expects($this->exactly(5)) - ->method('setEscargot'); + ->method('setEscargot') + ; + $subscriber ->expects($this->once()) ->method('setLogger') - ->with($this->callback(function (LoggerInterface $logger) { + ->with($this->callback( // Must be decorated - return $logger instanceof SubscriberLogger; - })); + static fn (LoggerInterface $logger) => $logger instanceof SubscriberLogger, + )) + ; $escargot->addSubscriber($subscriber); @@ -180,6 +185,55 @@ public function resolveTagValue(string $tag) $this->assertSame('success', $escargot->resolveTagValue('foobar')); } + public function testMaxDuration(): void + { + $mockResponse = <<<'HTML' + HTTP/2.0 200 OK + content-type: text/html; charset=UTF-8 + + + + + + Link + + + HTML; + + $baseUris = new BaseUriCollection(); + $baseUris->add(new Uri('https://www.terminal42.ch')); + $queue = new InMemoryQueue(); + $clock = new MockClock(); + $client = new MockHttpClient( + static function ($method, $url) use ($clock, $mockResponse) { + $clock->sleep(1); // Mock the request that takes a second to complete + + return MockResponseFactory::createFromString(sprintf($mockResponse, uniqid())); + }, + ); + $logger = new TestLogger(); + + $escargot = Escargot::create($baseUris, $queue) + ->withLogger($logger) + ->withHttpClient($client) + ->withMaxDurationInSeconds(5) + ->withClock($clock) + ; + + $escargot->addSubscriber(new HtmlCrawlerSubscriber()); + $escargot->addSubscriber($this->getSearchIndexSubscriber()); + + $escargot->crawl(); + + $this->assertSame( + [ + '[Terminal42\Escargot\Escargot] Configured max duration reached!', + '[Terminal42\Escargot\Escargot] Finished crawling! Sent 5 request(s).', + ], + $this->cleanLogs($logger), + ); + } + /** * @dataProvider crawlProvider */ @@ -193,7 +247,7 @@ public function testCrawlAsWebCrawler(\Closure $responseFactory, array $expected $escargot = Escargot::create($baseUris, $queue); $escargot = $escargot->withHttpClient(new MockHttpClient($responseFactory)); - if (0 !== \count($options)) { + if (0 !== (is_countable($options) ? \count($options) : 0)) { if (\array_key_exists('max_requests', $options)) { $escargot = $escargot->withMaxRequests((int) $options['max_requests']); } @@ -216,25 +270,11 @@ public function testCrawlAsWebCrawler(\Closure $responseFactory, array $expected $escargot->crawl(); - $filteredLogs = array_map(function (array $record) { - $message = $record['message']; - - if (isset($record['context']['crawlUri'])) { - $message = sprintf('[%s] %s', (string) $record['context']['crawlUri'], $message); - } - - if (isset($record['context']['source'])) { - $message = sprintf('[%s] %s', $record['context']['source'], $message); - } - - return $message; - }, $logger->records); + $filteredLogs = $this->cleanLogs($logger); $this->assertSame($expectedLogs, $filteredLogs, $message); - $filteredRequests = array_map(function (CrawlUri $crawlUri) { - return sprintf('Successful request! %s.', (string) $crawlUri); - }, $indexerSubscriber->getUris()); + $filteredRequests = array_map(static fn (CrawlUri $crawlUri) => sprintf('Successful request! %s.', (string) $crawlUri), $indexerSubscriber->getUris()); $this->assertSame($expectedRequests, $filteredRequests, $message); } @@ -255,6 +295,26 @@ public function crawlProvider(): \Generator } } + private function cleanLogs(TestLogger $testLogger): array + { + return array_map( + static function (array $record) { + $message = $record['message']; + + if (isset($record['context']['crawlUri'])) { + $message = sprintf('[%s] %s', (string) $record['context']['crawlUri'], $message); + } + + if (isset($record['context']['source'])) { + $message = sprintf('[%s] %s', $record['context']['source'], $message); + } + + return $message; + }, + $testLogger->records, + ); + } + private function getSearchIndexSubscriber(): SubscriberInterface { return new class() implements SubscriberInterface, EscargotAwareInterface, LoggerAwareInterface { @@ -262,7 +322,7 @@ private function getSearchIndexSubscriber(): SubscriberInterface use LoggerAwareTrait; use SubscriberLoggerTrait; - private $uris = []; + private array $uris = []; public function getUris(): array { @@ -277,7 +337,7 @@ public function shouldRequest(CrawlUri $crawlUri): string $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, - 'Do not request because when the crawl URI was found, the robots information disallowed following this URI.' + 'Do not request because when the crawl URI was found, the robots information disallowed following this URI.', ); return SubscriberInterface::DECISION_NEGATIVE; @@ -289,7 +349,7 @@ public function shouldRequest(CrawlUri $crawlUri): string $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, - 'Do not request because it was disallowed by the robots.txt.' + 'Do not request because it was disallowed by the robots.txt.', ); return SubscriberInterface::DECISION_NEGATIVE; @@ -300,7 +360,7 @@ public function shouldRequest(CrawlUri $crawlUri): string $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, - 'Do not request because when the crawl URI was found, the "rel" attribute contained "nofollow".' + 'Do not request because when the crawl URI was found, the "rel" attribute contained "nofollow".', ); return SubscriberInterface::DECISION_NEGATIVE; @@ -311,7 +371,7 @@ public function shouldRequest(CrawlUri $crawlUri): string $this->logWithCrawlUri( $crawlUri, LogLevel::DEBUG, - 'Do not request because when the crawl URI was found, the "type" attribute was present and did not contain "text/html".' + 'Do not request because when the crawl URI was found, the "type" attribute was present and did not contain "text/html".', ); return SubscriberInterface::DECISION_NEGATIVE; diff --git a/tests/Fixtures/scenario6/_logs.txt b/tests/Fixtures/scenario6/_logs.txt index 7fbb594..0a2cea9 100644 --- a/tests/Fixtures/scenario6/_logs.txt +++ b/tests/Fixtures/scenario6/_logs.txt @@ -1 +1,2 @@ +[Terminal42\Escargot\Escargot] Configured max requests reached! [Terminal42\Escargot\Escargot] Finished crawling! Sent 2 request(s). diff --git a/tests/Queue/DoctrineQueueTest.php b/tests/Queue/DoctrineQueueTest.php index 97dd78a..aca9a16 100644 --- a/tests/Queue/DoctrineQueueTest.php +++ b/tests/Queue/DoctrineQueueTest.php @@ -12,23 +12,28 @@ namespace Terminal42\Escargot\Tests\Queue; +use Doctrine\DBAL\Configuration; use Doctrine\DBAL\DriverManager; +use Doctrine\DBAL\Schema\DefaultSchemaManagerFactory; use Doctrine\DBAL\Schema\Table; +use Doctrine\DBAL\Tools\DsnParser; use Terminal42\Escargot\Queue\DoctrineQueue; use Terminal42\Escargot\Queue\QueueInterface; class DoctrineQueueTest extends AbstractQueueTest { - /** - * @var DoctrineQueue - */ - private $queue; + private DoctrineQueue $queue; protected function setUp(): void { - $this->queue = new DoctrineQueue(DriverManager::getConnection(['url' => 'sqlite://:memory:']), function () { - return 'foobar'; - }); + $configuration = new Configuration(); + $configuration->setSchemaManagerFactory(new DefaultSchemaManagerFactory()); + $connection = DriverManager::getConnection( + (new DsnParser(['sqlite' => 'pdo_sqlite']))->parse('sqlite://:memory:'), + $configuration, + ); + + $this->queue = new DoctrineQueue($connection, static fn () => 'foobar'); $this->queue->createSchema(); } diff --git a/tests/Queue/LazyQueueTest.php b/tests/Queue/LazyQueueTest.php index 605deb5..5c97128 100644 --- a/tests/Queue/LazyQueueTest.php +++ b/tests/Queue/LazyQueueTest.php @@ -12,7 +12,10 @@ namespace Terminal42\Escargot\Tests\Queue; +use Doctrine\DBAL\Configuration; use Doctrine\DBAL\DriverManager; +use Doctrine\DBAL\Schema\DefaultSchemaManagerFactory; +use Doctrine\DBAL\Tools\DsnParser; use Terminal42\Escargot\Queue\DoctrineQueue; use Terminal42\Escargot\Queue\InMemoryQueue; use Terminal42\Escargot\Queue\LazyQueue; @@ -20,16 +23,18 @@ class LazyQueueTest extends AbstractQueueTest { - /** - * @var DoctrineQueue - */ - private $queue; + private DoctrineQueue $queue; protected function setUp(): void { - $this->queue = new DoctrineQueue(DriverManager::getConnection(['url' => 'sqlite://:memory:']), function () { - return 'foobar'; - }); + $configuration = new Configuration(); + $configuration->setSchemaManagerFactory(new DefaultSchemaManagerFactory()); + $connection = DriverManager::getConnection( + (new DsnParser(['sqlite' => 'pdo_sqlite']))->parse('sqlite://:memory:'), + $configuration, + ); + + $this->queue = new DoctrineQueue($connection, static fn () => 'foobar'); $this->queue->createSchema(); } diff --git a/tests/Scenario/MockResponseFactory.php b/tests/Scenario/MockResponseFactory.php index 77f06ab..91e2ef1 100644 --- a/tests/Scenario/MockResponseFactory.php +++ b/tests/Scenario/MockResponseFactory.php @@ -35,7 +35,7 @@ public static function createFromString(string $contents): MockResponse } if (isset($mappedHeaders['x-escargottest-info'])) { - $info = array_merge($info, json_decode($mappedHeaders['x-escargottest-info'][0], true)); + $info = array_merge($info, json_decode($mappedHeaders['x-escargottest-info'][0], true, 512, JSON_THROW_ON_ERROR)); unset($mappedHeaders['x-escargottest-info']); } diff --git a/tests/Scenario/Scenario.php b/tests/Scenario/Scenario.php index 85b371b..c3ec5a6 100644 --- a/tests/Scenario/Scenario.php +++ b/tests/Scenario/Scenario.php @@ -16,49 +16,23 @@ class Scenario { - /** - * @var string - */ - private $name; - - /** - * @var string - */ - private $path; + private array|null $responses = null; - /** - * @var array - */ - private $responses; - - /** - * @var array - */ - private $requests = []; + private array $requests = []; - /** - * @var array - */ - private $logs = []; + private array $logs = []; - /** - * @var array - */ - private $options = []; + private array $options = []; - /** - * @var string - */ - private $description = 'No scenario description given'; + private string $description = 'No scenario description given'; /** * Scenario constructor. */ - public function __construct(string $name, string $path) - { - $this->name = $name; - $this->path = $path; - + public function __construct( + private readonly string $name, + private readonly string $path, + ) { $this->build(); } diff --git a/tests/Subscriber/UtilTest.php b/tests/Subscriber/UtilTest.php index 474c1a5..e054257 100644 --- a/tests/Subscriber/UtilTest.php +++ b/tests/Subscriber/UtilTest.php @@ -36,7 +36,7 @@ public function isAllowedToFollowProvider(): \Generator yield 'Found on an URI that should not be followed according to the x-robots-tag header or information' => [ new CrawlUri(new Uri('https://www.terminal42.ch/foobar'), 1, false, new Uri('https://www.terminal42.ch')), $this->createEscargotWithFoundOnUri( - (new CrawlUri(new Uri('https://www.terminal42.ch'), 0, true))->addTag(RobotsSubscriber::TAG_NOFOLLOW) + (new CrawlUri(new Uri('https://www.terminal42.ch'), 0, true))->addTag(RobotsSubscriber::TAG_NOFOLLOW), ), false, ];