From 0bde9ae2a611d7d36c5ba5afb291b174c2cccb27 Mon Sep 17 00:00:00 2001 From: IgorA100 Date: Mon, 16 Oct 2023 17:12:49 +0300 Subject: [PATCH] Fix: Relative links (#419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix: Relative URLs in links Feed discovery with relative URLs in links Example: https://k47.cz/ Solution to the problem: #417 * Update Feed.php (Fix relative URLs in links) We will replace the links (add the host if it doesn’t exist) in the content as well * Update Node.php (Fix relative URLs in links) * Update Reader.php (Fix relative URLs in links) Remember the URL for further processing * Update Link.php (Fix relative URLs in links) * Update XmlParser.php (Fix relative URLs in links) * Update Explorer.php * Update Node.php Delete tabs --- src/FeedIo/Explorer.php | 7 +++++-- src/FeedIo/Feed.php | 2 ++ src/FeedIo/Feed/Node.php | 27 +++++++++++++++++++++++++++ src/FeedIo/Parser/XmlParser.php | 3 ++- src/FeedIo/Reader.php | 1 + src/FeedIo/Rule/Link.php | 6 +++++- 6 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/FeedIo/Explorer.php b/src/FeedIo/Explorer.php index eb0b86d..9d9846a 100644 --- a/src/FeedIo/Explorer.php +++ b/src/FeedIo/Explorer.php @@ -29,14 +29,14 @@ public function discover(string $url): array $stream = $this->client->getResponse($url, new DateTime('@0')); $internalErrors = libxml_use_internal_errors(true); - $feeds = $this->extractFeeds($stream->getBody()); + $feeds = $this->extractFeeds($stream->getBody(), $url); libxml_use_internal_errors($internalErrors); return $feeds; } - protected function extractFeeds(string $html): array + protected function extractFeeds(string $html, string $url = null): array { $dom = new DOMDocument(); $dom->loadHTML($html); @@ -53,6 +53,9 @@ protected function extractFeeds(string $html): array // returning $href = 'https:' . $href; } + if (!parse_url($href, PHP_URL_HOST) && $url){ + $href = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . '/' . ltrim($href,'/'); + } $feeds[] = $href; } } diff --git a/src/FeedIo/Feed.php b/src/FeedIo/Feed.php index 75b9739..bec8337 100644 --- a/src/FeedIo/Feed.php +++ b/src/FeedIo/Feed.php @@ -134,6 +134,8 @@ public function rewind(): void public function add(ItemInterface $item): FeedInterface { + $item->setHostInContent($this->getHostFromLink()); + if ($item->getLastModified() > $this->getLastModified()) { $this->setLastModified($item->getLastModified()); } diff --git a/src/FeedIo/Feed/Node.php b/src/FeedIo/Feed/Node.php index fa251c2..e29ab31 100644 --- a/src/FeedIo/Feed/Node.php +++ b/src/FeedIo/Feed/Node.php @@ -150,6 +150,33 @@ protected function setHost(string $link = null): void } } + protected function setHostInContent(string $host = null): void + { + if (property_exists($this, 'content')){ + if (!is_null($host) && !is_null($this->content)) { + $this->content = preg_replace('!(<*\s*[^>]*)(href=)(.?)(\/[^\/])!','\1 href=\3'.$host.'\4', $this->content ); + $this->content = preg_replace('!(<*\s*[^>]*)(src=)(.?)(\/[^\/])!','\1 src=\3'.$host.'\4', $this->content ); + } + } + if (property_exists($this, 'description')){ + if (!is_null($host) && !is_null($this->description)) { + $this->description = preg_replace('!(<*\s*[^>]*)(href=)(.?)(\/[^\/])!','\1 href=\3'.$host.'\4', $this->description ); + $this->description = preg_replace('!(<*\s*[^>]*)(src=)(.?)(\/[^\/])!','\1 src=\3'.$host.'\4', $this->description ); + } + } + } + + public function getHostFromLink(): ?string + { + if (!is_null($this->getLink())) { + $partsUrl = parse_url($this->getLink()); + $result = $partsUrl['scheme']."://".$partsUrl['host']; + } else + $result = null; + + return $result; + } + public function getValue(string $name): ?string { foreach ($this->getElementIterator($name) as $element) { diff --git a/src/FeedIo/Parser/XmlParser.php b/src/FeedIo/Parser/XmlParser.php index 272391d..3156bc3 100644 --- a/src/FeedIo/Parser/XmlParser.php +++ b/src/FeedIo/Parser/XmlParser.php @@ -76,7 +76,8 @@ public function parseNode(NodeInterface $item, DOMElement $element, RuleSet $rul protected function handleNode(NodeInterface $item, DOMElement $node, RuleSet $ruleSet): void { if ($this->isItem($node->tagName) && $item instanceof FeedInterface) { - $newItem = $this->parseNode($item->newItem(), $node, $this->getItemRuleSet()); + $linkItem = $item->getLink(); + $newItem = $this->parseNode($item->newItem()->setLink($linkItem), $node, $this->getItemRuleSet()); $this->addValidItem($item, $newItem); } else { $rule = $ruleSet->get($node->tagName); diff --git a/src/FeedIo/Reader.php b/src/FeedIo/Reader.php index 6f54901..5687eeb 100644 --- a/src/FeedIo/Reader.php +++ b/src/FeedIo/Reader.php @@ -73,6 +73,7 @@ public function read(string $url, FeedInterface $feed, DateTime $modifiedSince = try { $this->logger->info("hitting {$url}"); $response = $this->client->getResponse($url, $modifiedSince); + $feed->setLink($url); $document = $this->handleResponse($response, $feed); return new Result($document, $feed, $modifiedSince, $response, $url); diff --git a/src/FeedIo/Rule/Link.php b/src/FeedIo/Rule/Link.php index 2cd1231..ca65a8c 100644 --- a/src/FeedIo/Rule/Link.php +++ b/src/FeedIo/Rule/Link.php @@ -17,7 +17,11 @@ class Link extends RuleAbstract */ public function setProperty(NodeInterface $node, \DOMElement $element): void { - $node->setLink($element->nodeValue); + $nodeValue = $element->nodeValue; + if (parse_url($nodeValue, PHP_URL_HOST) == null) { + $nodeValue = $node->getHostFromLink(). $nodeValue; + } + $node->setLink($nodeValue); } /**