From d27fb225c2fadb454a4bb19394798a096a72aa1b Mon Sep 17 00:00:00 2001 From: --system Date: Mon, 6 Jan 2025 22:25:55 +0100 Subject: [PATCH] [TASK] Split up ContentParser to multiple classes and interfaces --- .../ContentExtractors/BaseUrlParser.php | 19 +++ .../BaseUrlParserInterface.php | 10 ++ .../ContentExtractors/BodyProcessor.php | 52 ++++++ .../BodyProcessorInterface.php | 10 ++ .../ContentMetadataExtractor.php | 48 ++++++ .../ContentMetadataExtractorInterface.php | 12 ++ .../ContentExtractors/FaviconExtractor.php | 22 +++ .../FaviconExtractorInterface.php | 10 ++ .../TitleConfigurationExtractor.php | 24 +++ .../TitleConfigurationExtractorInterface.php | 13 ++ Classes/Service/Preview/ContentParser.php | 153 +++--------------- Configuration/Services.yaml | 9 ++ 12 files changed, 250 insertions(+), 132 deletions(-) create mode 100644 Classes/Service/Preview/ContentExtractors/BaseUrlParser.php create mode 100644 Classes/Service/Preview/ContentExtractors/BaseUrlParserInterface.php create mode 100644 Classes/Service/Preview/ContentExtractors/BodyProcessor.php create mode 100644 Classes/Service/Preview/ContentExtractors/BodyProcessorInterface.php create mode 100644 Classes/Service/Preview/ContentExtractors/ContentMetadataExtractor.php create mode 100644 Classes/Service/Preview/ContentExtractors/ContentMetadataExtractorInterface.php create mode 100644 Classes/Service/Preview/ContentExtractors/FaviconExtractor.php create mode 100644 Classes/Service/Preview/ContentExtractors/FaviconExtractorInterface.php create mode 100644 Classes/Service/Preview/ContentExtractors/TitleConfigurationExtractor.php create mode 100644 Classes/Service/Preview/ContentExtractors/TitleConfigurationExtractorInterface.php diff --git a/Classes/Service/Preview/ContentExtractors/BaseUrlParser.php b/Classes/Service/Preview/ContentExtractors/BaseUrlParser.php new file mode 100644 index 00000000..983548f0 --- /dev/null +++ b/Classes/Service/Preview/ContentExtractors/BaseUrlParser.php @@ -0,0 +1,19 @@ +]*>(.*)<\/body>/is", $content, $matchesBody); + + if ($bodyFound) { + $body = $matchesBody[1]; + + preg_match_all( + '/.*?/mis', + $body, + $indexableContents + ); + + if (is_array($indexableContents[0]) && !empty($indexableContents[0])) { + $body = implode('', $indexableContents[0]); + } + } + + return $this->prepareBody($body); + } + + protected function prepareBody(string $body): string + { + $body = $this->stripTagsContent($body, '