diff --git a/code/StaticSiteContentExtractor.php b/code/StaticSiteContentExtractor.php index 325a481..d6bd55c 100644 --- a/code/StaticSiteContentExtractor.php +++ b/code/StaticSiteContentExtractor.php @@ -9,323 +9,344 @@ * Given a set of fieldnames and CSS selectors corresponding to them, a map of content * fields will be returned. */ -class StaticSiteContentExtractor extends Object { - - /** - * - * @var string - */ - protected $url = null; - - /** - * - * @var string - */ - protected $content = null; - - /** - * - * @var phpQueryObject - */ - protected $phpQuery = null; - - /** - * Set this by using the yml config system - * - * Example: - * - * StaticSiteContentExtractor: +class StaticSiteContentExtractor extends Object +{ + + /** + * + * @var string + */ + protected $url = null; + + /** + * + * @var string + */ + protected $content = null; + + /** + * + * @var phpQueryObject + */ + protected $phpQuery = null; + + /** + * Set this by using the yml config system + * + * Example: + * + * StaticSiteContentExtractor: * log_file: ../logs/import-log.txt - * - * - * @var string - */ - private static $log_file = null; - - /** - * Create a StaticSiteContentExtractor for a single URL/. - * - * @param string $url The absolute URL to extract content from - */ - public function __construct($url) { - $this->url = $url; - } - - /** - * Extract content for map of field => css-selector pairs - * - * @param array $selectorMap A map of field name => css-selector - * @return array A map of field name => array('selector' => selector, 'content' => field content) - */ - public function extractMapAndSelectors($selectorMap) { - - if(!$this->phpQuery) { - $this->fetchContent(); - } - - $output = array(); - - foreach($selectorMap as $fieldName => $extractionRules) { - if(!is_array($extractionRules)) { - $extractionRules = array($extractionRules); - } - - foreach($extractionRules as $extractionRule) { - if(!is_array($extractionRule)) { - $extractionRule = array('selector' => $extractionRule); - } - - $content = $this->extractField($extractionRule['selector'], $extractionRule['attribute'], $extractionRule['outerhtml']); - - if(!$content) { - continue; - } - - $content = $this->excludeContent($extractionRule['excludeselectors'], $extractionRule['selector'], $content); - - if(!$content) { - continue; - } - - if(!empty($extractionRule['plaintext'])) { - $content = Convert::html2raw($content); - } - - // We found a match, select that one and ignore any other selectors - $output[$fieldName] = $extractionRule; - $output[$fieldName]['content'] = $content; - $this->log("Value set for $fieldName"); - break; - } - } - return $output; - } - - /** - * Extract content for a single css selector - * - * @param string $cssSelector The selector for which to extract content. - * @param string $attribute If set, the value will be from this HTML attribute - * @param bool $outherHTML should we return the full HTML of the whole field - * @return string The content for that selector - */ - public function extractField($cssSelector, $attribute = null, $outerHTML = false) { - if(!$this->phpQuery) { - $this->fetchContent(); - } - - $elements = $this->phpQuery[$cssSelector]; - - // just return the inner HTML for this node - if(!$outerHTML || !$attribute) { - return trim($elements->html()); - } - - $result = ''; - foreach($elements as $element) { - // Get the full html for this element - if($outerHTML) { - $result .= $this->getOuterHTML($element); - // Get the value of a attribute - } elseif($attribute && trim($element->getAttribute($attribute))) { - $result .= ($element->getAttribute($attribute)).PHP_EOL; - } - } - - return trim($result); - } - - /** - * Strip away content from $content that matches one or many css selectors. - * - * @param array $excludeSelectors - * @param string $content - * @return string - */ - protected function excludeContent($excludeSelectors, $parentSelector, $content) { - if(!$excludeSelectors) { - return $content; - } - - foreach($excludeSelectors as $excludeSelector) { - if(!trim($excludeSelector)) { - continue; - } - $element = $this->phpQuery[$parentSelector.' '.$excludeSelector]; - if($element) { - $remove = $element->htmlOuter(); - $content = str_replace($remove, '', $content); - $this->log(' - Excluded content from "'.$parentSelector.' '.$excludeSelector.'"'); - } - } - return ($content); - } - - /** - * Get the full HTML of the element and its childs - * - * @param DOMElement $element - * @return string - */ - protected function getOuterHTML(DOMElement $element) { - $doc = new DOMDocument(); - $doc->formatOutput = false; - $doc->preserveWhiteSpace = true; - $doc->substituteEntities = false; - $doc->appendChild($doc->importNode($element, true)); - return $doc->saveHTML(); - } - - /** - * - * @return string - */ - public function getContent() { - return $this->content; - } - - /** - * Fetch the content and initialise $this->content and $this->phpQuery - * - * @return void - */ - protected function fetchContent() { - $this->log('Fetching ' . $this->url); - - $response = $this->curlRequest($this->url, "GET"); - $this->content = $response->getBody(); - $this->phpQuery = phpQuery::newDocument($this->content); - - //// Make the URLs all absolute - - // Useful parts of the URL - if(!preg_match('#^[a-z]+:#i', $this->url, $matches)) throw new Exception('Bad URL: ' . $this->url); - $protocol = $matches[0]; - - if(!preg_match('#^[a-z]+://[^/]+#i', $this->url, $matches)) throw new Exception('Bad URL: ' . $this->url); - $server = $matches[0]; - - $base = (substr($this->url,-1) == '/') ? $this->url : dirname($this->url) . '/'; - - $this->log('Rewriting links in content'); - - $rewriter = new StaticSiteLinkRewriter(function($url) use($protocol, $server, $base) { - // Absolute - if(preg_match('#^[a-z]+://[^/]+#i', $url) || substr($url,0,7) == 'mailto:') return $url; - - // Protocol relative - if(preg_match('#^//[^/]#i', $url)) return $protocol . $url; - - // Server relative - if($url[0] == "/") return $server . $url; - - // Relative - $result = $base . $url; - while(strpos($result, '/../') !== false) { - $result = preg_replace('#[^/]+/+../+#i','/', $result); - } - while(strpos($result, '/./') !== false) { - $result = str_replace('/./','/', $result); - } - return $result; - - }); - - #$rewriter->rewriteInPQ($this->phpQuery); - #echo($this->phpQuery->html()); - } - - /** - * Use cURL to request a URL, and return a SS_HTTPResponse object. - * - * @param string $url - * @param string $method - * @param string $data - * @param string $headers - * @param array $curlOptions - * @return \SS_HTTPResponse - */ - protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = array()) { - $ch = curl_init(); - $timeout = 5; - $ssInfo = new SapphireInfo; - $useragent = 'SilverStripe/' . $ssInfo->version(); - - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - curl_setopt($ch, CURLOPT_USERAGENT, $useragent); - curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); - curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method); - curl_setopt($ch, CURLOPT_HEADER, 1); - - if($headers) curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); - - // Add fields to POST and PUT requests - if($method == 'POST') { - curl_setopt($ch, CURLOPT_POST, 1); - curl_setopt($ch, CURLOPT_POSTFIELDS, $data); - } elseif($method == 'PUT') { - $put = fopen("php://temp", 'r+'); - fwrite($put, $data); - fseek($put, 0); - - curl_setopt($ch, CURLOPT_PUT, 1); - curl_setopt($ch, CURLOPT_INFILE, $put); - curl_setopt($ch, CURLOPT_INFILESIZE, strlen($data)); - } - - // Follow redirects - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE); - - // Set any custom options passed to the request() function - curl_setopt_array($ch, $curlOptions); - - // Run request - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - $fullResponseBody = curl_exec($ch); - $curlError = curl_error($ch); - - list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r","",$fullResponseBody), 2); - if(preg_match("#^HTTP/1.1 100#", $responseHeaders)) { - list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r","",$responseBody), 2); - } - - $responseHeaders = explode("\n", trim($responseHeaders)); - array_shift($responseHeaders); - - $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); - curl_close($ch); - - if($curlError !== '' || $statusCode == 0) { - $statusCode = 500; - } - - $response = new SS_HTTPResponse($responseBody, $statusCode); - foreach($responseHeaders as $headerLine) { - if(strpos($headerLine, ":") !== false) { - list($headerName, $headerVal) = explode(":", $headerLine, 2); - $response->addHeader(trim($headerName), trim($headerVal)); - } - } - - - return $response; - } - - /** - * Log a message if the logging has been setup according to docs - * - * @param string $message - * @return void - */ - protected function log($message) { - $logFile = Config::inst()->get('StaticSiteContentExtractor','log_file'); - if(!$logFile) { - return; - } - - if(is_writable($logFile) || !file_exists($logFile) && is_writable(dirname($logFile))) { - error_log($message . "\n", 3, $logFile); - } - } + * + * + * @var string + */ + private static $log_file = null; + + /** + * Create a StaticSiteContentExtractor for a single URL/. + * + * @param string $url The absolute URL to extract content from + */ + public function __construct($url) + { + $this->url = $url; + } + + /** + * Extract content for map of field => css-selector pairs + * + * @param array $selectorMap A map of field name => css-selector + * @return array A map of field name => array('selector' => selector, 'content' => field content) + */ + public function extractMapAndSelectors($selectorMap) + { + if (!$this->phpQuery) { + $this->fetchContent(); + } + + $output = array(); + + foreach ($selectorMap as $fieldName => $extractionRules) { + if (!is_array($extractionRules)) { + $extractionRules = array($extractionRules); + } + + foreach ($extractionRules as $extractionRule) { + if (!is_array($extractionRule)) { + $extractionRule = array('selector' => $extractionRule); + } + + $content = $this->extractField($extractionRule['selector'], $extractionRule['attribute'], $extractionRule['outerhtml']); + + if (!$content) { + continue; + } + + $content = $this->excludeContent($extractionRule['excludeselectors'], $extractionRule['selector'], $content); + + if (!$content) { + continue; + } + + if (!empty($extractionRule['plaintext'])) { + $content = Convert::html2raw($content); + } + + // We found a match, select that one and ignore any other selectors + $output[$fieldName] = $extractionRule; + $output[$fieldName]['content'] = $content; + $this->log("Value set for $fieldName"); + break; + } + } + return $output; + } + + /** + * Extract content for a single css selector + * + * @param string $cssSelector The selector for which to extract content. + * @param string $attribute If set, the value will be from this HTML attribute + * @param bool $outherHTML should we return the full HTML of the whole field + * @return string The content for that selector + */ + public function extractField($cssSelector, $attribute = null, $outerHTML = false) + { + if (!$this->phpQuery) { + $this->fetchContent(); + } + + $elements = $this->phpQuery[$cssSelector]; + + // just return the inner HTML for this node + if (!$outerHTML || !$attribute) { + return trim($elements->html()); + } + + $result = ''; + foreach ($elements as $element) { + // Get the full html for this element + if ($outerHTML) { + $result .= $this->getOuterHTML($element); + // Get the value of a attribute + } elseif ($attribute && trim($element->getAttribute($attribute))) { + $result .= ($element->getAttribute($attribute)).PHP_EOL; + } + } + + return trim($result); + } + + /** + * Strip away content from $content that matches one or many css selectors. + * + * @param array $excludeSelectors + * @param string $content + * @return string + */ + protected function excludeContent($excludeSelectors, $parentSelector, $content) + { + if (!$excludeSelectors) { + return $content; + } + + foreach ($excludeSelectors as $excludeSelector) { + if (!trim($excludeSelector)) { + continue; + } + $element = $this->phpQuery[$parentSelector.' '.$excludeSelector]; + if ($element) { + $remove = $element->htmlOuter(); + $content = str_replace($remove, '', $content); + $this->log(' - Excluded content from "'.$parentSelector.' '.$excludeSelector.'"'); + } + } + return ($content); + } + + /** + * Get the full HTML of the element and its childs + * + * @param DOMElement $element + * @return string + */ + protected function getOuterHTML(DOMElement $element) + { + $doc = new DOMDocument(); + $doc->formatOutput = false; + $doc->preserveWhiteSpace = true; + $doc->substituteEntities = false; + $doc->appendChild($doc->importNode($element, true)); + return $doc->saveHTML(); + } + + /** + * + * @return string + */ + public function getContent() + { + return $this->content; + } + + /** + * Fetch the content and initialise $this->content and $this->phpQuery + * + * @return void + */ + protected function fetchContent() + { + $this->log('Fetching ' . $this->url); + + $response = $this->curlRequest($this->url, "GET"); + $this->content = $response->getBody(); + $this->phpQuery = phpQuery::newDocument($this->content); + + //// Make the URLs all absolute + + // Useful parts of the URL + if (!preg_match('#^[a-z]+:#i', $this->url, $matches)) { + throw new Exception('Bad URL: ' . $this->url); + } + $protocol = $matches[0]; + + if (!preg_match('#^[a-z]+://[^/]+#i', $this->url, $matches)) { + throw new Exception('Bad URL: ' . $this->url); + } + $server = $matches[0]; + + $base = (substr($this->url, -1) == '/') ? $this->url : dirname($this->url) . '/'; + + $this->log('Rewriting links in content'); + + $rewriter = new StaticSiteLinkRewriter(function ($url) use ($protocol, $server, $base) { + // Absolute + if (preg_match('#^[a-z]+://[^/]+#i', $url) || substr($url, 0, 7) == 'mailto:') { + return $url; + } + + // Protocol relative + if (preg_match('#^//[^/]#i', $url)) { + return $protocol . $url; + } + + // Server relative + if ($url[0] == "/") { + return $server . $url; + } + + // Relative + $result = $base . $url; + while (strpos($result, '/../') !== false) { + $result = preg_replace('#[^/]+/+../+#i', '/', $result); + } + while (strpos($result, '/./') !== false) { + $result = str_replace('/./', '/', $result); + } + return $result; + + }); + + #$rewriter->rewriteInPQ($this->phpQuery); + #echo($this->phpQuery->html()); + } + + /** + * Use cURL to request a URL, and return a SS_HTTPResponse object. + * + * @param string $url + * @param string $method + * @param string $data + * @param string $headers + * @param array $curlOptions + * @return \SS_HTTPResponse + */ + protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = array()) + { + $ch = curl_init(); + $timeout = 5; + $ssInfo = new SapphireInfo; + $useragent = 'SilverStripe/' . $ssInfo->version(); + + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); + curl_setopt($ch, CURLOPT_USERAGENT, $useragent); + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); + curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method); + curl_setopt($ch, CURLOPT_HEADER, 1); + + if ($headers) { + curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); + } + + // Add fields to POST and PUT requests + if ($method == 'POST') { + curl_setopt($ch, CURLOPT_POST, 1); + curl_setopt($ch, CURLOPT_POSTFIELDS, $data); + } elseif ($method == 'PUT') { + $put = fopen("php://temp", 'r+'); + fwrite($put, $data); + fseek($put, 0); + + curl_setopt($ch, CURLOPT_PUT, 1); + curl_setopt($ch, CURLOPT_INFILE, $put); + curl_setopt($ch, CURLOPT_INFILESIZE, strlen($data)); + } + + // Follow redirects + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + + // Set any custom options passed to the request() function + curl_setopt_array($ch, $curlOptions); + + // Run request + curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); + $fullResponseBody = curl_exec($ch); + $curlError = curl_error($ch); + + list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r", "", $fullResponseBody), 2); + if (preg_match("#^HTTP/1.1 100#", $responseHeaders)) { + list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r", "", $responseBody), 2); + } + + $responseHeaders = explode("\n", trim($responseHeaders)); + array_shift($responseHeaders); + + $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); + curl_close($ch); + + if ($curlError !== '' || $statusCode == 0) { + $statusCode = 500; + } + + $response = new SS_HTTPResponse($responseBody, $statusCode); + foreach ($responseHeaders as $headerLine) { + if (strpos($headerLine, ":") !== false) { + list($headerName, $headerVal) = explode(":", $headerLine, 2); + $response->addHeader(trim($headerName), trim($headerVal)); + } + } + + + return $response; + } + + /** + * Log a message if the logging has been setup according to docs + * + * @param string $message + * @return void + */ + protected function log($message) + { + $logFile = Config::inst()->get('StaticSiteContentExtractor', 'log_file'); + if (!$logFile) { + return; + } + + if (is_writable($logFile) || !file_exists($logFile) && is_writable(dirname($logFile))) { + error_log($message . "\n", 3, $logFile); + } + } } diff --git a/code/StaticSiteContentItem.php b/code/StaticSiteContentItem.php index 56ffc0b..8bca9fa 100644 --- a/code/StaticSiteContentItem.php +++ b/code/StaticSiteContentItem.php @@ -1,69 +1,81 @@ externalId; - - $processedURL = $this->source->urlList()->processedURL($url); - $parentURL = $this->source->urlList()->parentProcessedURL($processedURL); - - $subURL = substr($processedURL, strlen($parentURL)); - if($subURL != "/") $subURL = preg_replace('#(^/)|(/$)#','',$subURL); - - $this->Name = $subURL; - $this->Title = $this->Name; - $this->AbsoluteURL = preg_replace('#/$#','', $this->source->BaseUrl) . $this->externalId; - $this->ProcessedURL = $processedURL; - } - - public function stageChildren($showAll = false) { - if(!$this->source->urlList()->hasCrawled()) return new ArrayList; - - $childrenURLs = $this->source->urlList()->getChildren($this->externalId); - - $children = new ArrayList; - foreach($childrenURLs as $child) { - $children->push($this->source->getObject($child)); - } - - return $children; - } - - public function numChildren() { - if(!$this->source->urlList()->hasCrawled()) return 0; - - return sizeof($this->source->urlList()->getChildren($this->externalId)); - } - - public function getType() { - return "sitetree"; - } - - public function getCMSFields() { - $fields = parent::getCMSFields(); - - // Add the preview fields here, including rules used - $t = new StaticSitePageTransformer; - - $urlField = new ReadonlyField("PreviewSourceURL", "Imported from", - "AbsoluteURL\">" . Convert::raw2xml($this->AbsoluteURL) . ""); - $urlField->dontEscape = true; - - $fields->addFieldToTab("Root.Preview", $urlField); - - $content = $t->getContentFieldsAndSelectors($this); - if(count($content) === 0) { - return $fields; - } - foreach($content as $k => $v) { - $readonlyField = new ReadonlyField("Preview$k", "$k
\n" . $v['selector'] . "", $v['content']); - $readonlyField->addExtraClass('readonly-click-toggle'); - $fields->addFieldToTab("Root.Preview", $readonlyField); - } - - Requirements::javascript('staticsiteconnector/js/StaticSiteContentItem.js'); - Requirements::css('staticsiteconnector/css/StaticSiteContentItem.css'); - - return $fields; - } -} \ No newline at end of file +class StaticSiteContentItem extends ExternalContentItem +{ + public function init() + { + $url = $this->externalId; + + $processedURL = $this->source->urlList()->processedURL($url); + $parentURL = $this->source->urlList()->parentProcessedURL($processedURL); + + $subURL = substr($processedURL, strlen($parentURL)); + if ($subURL != "/") { + $subURL = preg_replace('#(^/)|(/$)#', '', $subURL); + } + + $this->Name = $subURL; + $this->Title = $this->Name; + $this->AbsoluteURL = preg_replace('#/$#', '', $this->source->BaseUrl) . $this->externalId; + $this->ProcessedURL = $processedURL; + } + + public function stageChildren($showAll = false) + { + if (!$this->source->urlList()->hasCrawled()) { + return new ArrayList; + } + + $childrenURLs = $this->source->urlList()->getChildren($this->externalId); + + $children = new ArrayList; + foreach ($childrenURLs as $child) { + $children->push($this->source->getObject($child)); + } + + return $children; + } + + public function numChildren() + { + if (!$this->source->urlList()->hasCrawled()) { + return 0; + } + + return sizeof($this->source->urlList()->getChildren($this->externalId)); + } + + public function getType() + { + return "sitetree"; + } + + public function getCMSFields() + { + $fields = parent::getCMSFields(); + + // Add the preview fields here, including rules used + $t = new StaticSitePageTransformer; + + $urlField = new ReadonlyField("PreviewSourceURL", "Imported from", + "AbsoluteURL\">" . Convert::raw2xml($this->AbsoluteURL) . ""); + $urlField->dontEscape = true; + + $fields->addFieldToTab("Root.Preview", $urlField); + + $content = $t->getContentFieldsAndSelectors($this); + if (count($content) === 0) { + return $fields; + } + foreach ($content as $k => $v) { + $readonlyField = new ReadonlyField("Preview$k", "$k
\n" . $v['selector'] . "", $v['content']); + $readonlyField->addExtraClass('readonly-click-toggle'); + $fields->addFieldToTab("Root.Preview", $readonlyField); + } + + Requirements::javascript('staticsiteconnector/js/StaticSiteContentItem.js'); + Requirements::css('staticsiteconnector/css/StaticSiteContentItem.css'); + + return $fields; + } +} diff --git a/code/StaticSiteContentSource.php b/code/StaticSiteContentSource.php index 412136d..8caf79f 100644 --- a/code/StaticSiteContentSource.php +++ b/code/StaticSiteContentSource.php @@ -1,377 +1,401 @@ 'Varchar(255)', - 'UrlProcessor' => 'Varchar(255)', - 'ExtraCrawlUrls' => 'Text', - 'UrlExcludePatterns' => 'Text', - ); - - public static $has_many = array( - "Schemas" => "StaticSiteContentSource_ImportSchema", - "Pages" => "SiteTree", - ); - - - public function getCMSFields() { - $fields = parent::getCMSFields(); - - $importRules = $fields->dataFieldByName('Schemas'); - $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); - $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); - $addNewButton = new GridFieldAddNewButton('after'); - $addNewButton->setButtonName("Add schema"); - $importRules->getConfig()->addComponent($addNewButton); - - $fields->removeFieldFromTab("Root", "Schemas"); - $fields->removeFieldFromTab("Root", "Pages"); - $fields->addFieldToTab("Root.Main", new LiteralField("", "

Each import rule will import content for a field" - . " by getting the results of a CSS selector. If more than one rule exists for a field, then they will be" - . " processed in the order they appear. The first rule that returns content will be the one used.

")); - $fields->addFieldToTab("Root.Main", $importRules); - - $processingOptions = array("" => "No pre-processing"); - foreach(ClassInfo::implementorsOf('StaticSiteUrlProcessor') as $processor) { - $processorObj = new $processor; - $processingOptions[$processor] = "" . Convert::raw2xml($processorObj->getName()) - . "
" . Convert::raw2xml($processorObj->getDescription()); - } - - $fields->addFieldToTab("Root.Main", new OptionsetField("UrlProcessor", "URL processing", $processingOptions)); - - - switch($this->urlList()->getSpiderStatus()) { - case "Not started": - $crawlButtonText = _t('StaticSiteContentSource.CRAWL_SITE', 'Crawl site'); - break; - - case "Partial": - $crawlButtonText = _t('StaticSiteContentSource.RESUME_CRAWLING', 'Resume crawling'); - break; - - case "Complete": - $crawlButtonText = _t('StaticSiteContentSource.RECRAWL_SITE', 'Re-crawl site'); - break; - - default: - throw new LogicException("Invalid getSpiderStatus() value '".$this->urlList()->getSpiderStatus().";"); - } - - - $crawlButton = FormAction::create('crawlsite', $crawlButtonText) - ->setAttribute('data-icon', 'arrow-circle-double') - ->setUseButtonTag(true); - $fields->addFieldsToTab('Root.Crawl', array( - new ReadonlyField("CrawlStatus", "Crawling Status", $this->urlList()->getSpiderStatus()), - new ReadonlyField("NumURLs", "Number of URLs", $this->urlList()->getNumURLs()), - - new LiteralField('CrawlActions', - "

Before importing this content, all URLs on the site must be crawled (like a search engine does). Click" - . " the button below to do so:

" - . "
{$crawlButton->forTemplate()}
") - )); - - if($this->urlList()->getSpiderStatus() == "Complete") { - $urlsAsUL = ""; - - $fields->addFieldToTab('Root.Crawl', - new LiteralField('CrawlURLList', "

The following URLs have been identified:

" . $urlsAsUL) - ); - - - } - - $fields->dataFieldByName("ExtraCrawlUrls") - ->setDescription("Add URLs that are not reachable through content scraping, eg: '/about/team'. One per line") - ->setTitle('Additional URLs'); - $fields->dataFieldByName("UrlExcludePatterns") - ->setDescription("URLs that should be excluded (support regular expression). eg: '/about/.*'. One per URL") - ->setTitle('Excluded URLs'); - - return $fields; - } - - public function onAfterWrite() { - parent::onAfterWrite(); - - $urlList = $this->urlList(); - if($this->isChanged('UrlProcessor') && $urlList->hasCrawled()) { - if($processorClass = $this->UrlProcessor) { - $urlList->setUrlProcessor(new $processorClass); - } else { - $urlList->setUrlProcessor(null); - } - $urlList->reprocessUrls(); - } - } - - - public function urlList() { - if(!$this->urlList) { - $this->urlList = new StaticSiteUrlList($this->BaseUrl, "../assets/static-site-" . $this->ID); - if($processorClass = $this->UrlProcessor) { - $this->urlList->setUrlProcessor(new $processorClass); - } - if($this->ExtraCrawlUrls) { - $extraCrawlUrls = preg_split('/\s+/', trim($this->ExtraCrawlUrls)); - $this->urlList->setExtraCrawlUrls($extraCrawlUrls); - } - if($this->UrlExcludePatterns) { - $urlExcludePatterns = preg_split('/\s+/', trim($this->UrlExcludePatterns)); - $this->urlList->setExcludePatterns($urlExcludePatterns); - } - } - return $this->urlList; - } - - /** - * Crawl the target site - * @return StaticSiteCrawler - */ - public function crawl($limit=false, $verbose=false) { - if(!$this->BaseUrl) throw new LogicException("Can't crawl a site until Base URL is set."); - return $this->urlList()->crawl($limit, $verbose); - } - - public function getSchemaForURL($absoluteURL) { - // TODO: Return the right schema - return $this->Schemas()->First(); - } - - /** - * Returns a StaticSiteContentItem for the given URL. - * Relative URLs are used as the unique identifiers by this importer - * - * @param $id The URL, relative to BaseURL, starting with "/". - * @return DataObject - */ - public function getObject($id) { - - if($id[0] != "/") { - $id = $this->decodeId($id); - if($id[0] != "/") throw new InvalidArgumentException("\$id must start with /"); - } - - return new StaticSiteContentItem($this, $id); - } - - public function getRoot() { - return $this->getObject('/'); - } - - public function allowedImportTargets() { - return array('sitetree' => true); - } - - /** - * Return the root node - * @return ArrayList A list containing the root node - */ - public function stageChildren($showAll = false) { - if(!$this->urlList()->hasCrawled()) return new ArrayList; - - return new ArrayList(array( - $this->getObject("/") - )); - - } - - public function getContentImporter($target=null) { - return new StaticSiteImporter(); - } - - public function isValid() { - if(!(boolean)$this->BaseUrl) { - return false; - } - return true; - } - public function canImport($member = null) { - return $this->isValid(); - } - public function canCreate($member = null) { - return true; - } - +class StaticSiteContentSource extends ExternalContentSource +{ + + public static $db = array( + 'BaseUrl' => 'Varchar(255)', + 'UrlProcessor' => 'Varchar(255)', + 'ExtraCrawlUrls' => 'Text', + 'UrlExcludePatterns' => 'Text', + ); + + public static $has_many = array( + "Schemas" => "StaticSiteContentSource_ImportSchema", + "Pages" => "SiteTree", + ); + + + public function getCMSFields() + { + $fields = parent::getCMSFields(); + + $importRules = $fields->dataFieldByName('Schemas'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); + $addNewButton = new GridFieldAddNewButton('after'); + $addNewButton->setButtonName("Add schema"); + $importRules->getConfig()->addComponent($addNewButton); + + $fields->removeFieldFromTab("Root", "Schemas"); + $fields->removeFieldFromTab("Root", "Pages"); + $fields->addFieldToTab("Root.Main", new LiteralField("", "

Each import rule will import content for a field" + . " by getting the results of a CSS selector. If more than one rule exists for a field, then they will be" + . " processed in the order they appear. The first rule that returns content will be the one used.

")); + $fields->addFieldToTab("Root.Main", $importRules); + + $processingOptions = array("" => "No pre-processing"); + foreach (ClassInfo::implementorsOf('StaticSiteUrlProcessor') as $processor) { + $processorObj = new $processor; + $processingOptions[$processor] = "" . Convert::raw2xml($processorObj->getName()) + . "
" . Convert::raw2xml($processorObj->getDescription()); + } + + $fields->addFieldToTab("Root.Main", new OptionsetField("UrlProcessor", "URL processing", $processingOptions)); + + + switch ($this->urlList()->getSpiderStatus()) { + case "Not started": + $crawlButtonText = _t('StaticSiteContentSource.CRAWL_SITE', 'Crawl site'); + break; + + case "Partial": + $crawlButtonText = _t('StaticSiteContentSource.RESUME_CRAWLING', 'Resume crawling'); + break; + + case "Complete": + $crawlButtonText = _t('StaticSiteContentSource.RECRAWL_SITE', 'Re-crawl site'); + break; + + default: + throw new LogicException("Invalid getSpiderStatus() value '".$this->urlList()->getSpiderStatus().";"); + } + + + $crawlButton = FormAction::create('crawlsite', $crawlButtonText) + ->setAttribute('data-icon', 'arrow-circle-double') + ->setUseButtonTag(true); + $fields->addFieldsToTab('Root.Crawl', array( + new ReadonlyField("CrawlStatus", "Crawling Status", $this->urlList()->getSpiderStatus()), + new ReadonlyField("NumURLs", "Number of URLs", $this->urlList()->getNumURLs()), + + new LiteralField('CrawlActions', + "

Before importing this content, all URLs on the site must be crawled (like a search engine does). Click" + . " the button below to do so:

" + . "
{$crawlButton->forTemplate()}
") + )); + + if ($this->urlList()->getSpiderStatus() == "Complete") { + $urlsAsUL = ""; + + $fields->addFieldToTab('Root.Crawl', + new LiteralField('CrawlURLList', "

The following URLs have been identified:

" . $urlsAsUL) + ); + } + + $fields->dataFieldByName("ExtraCrawlUrls") + ->setDescription("Add URLs that are not reachable through content scraping, eg: '/about/team'. One per line") + ->setTitle('Additional URLs'); + $fields->dataFieldByName("UrlExcludePatterns") + ->setDescription("URLs that should be excluded (support regular expression). eg: '/about/.*'. One per URL") + ->setTitle('Excluded URLs'); + + return $fields; + } + + public function onAfterWrite() + { + parent::onAfterWrite(); + + $urlList = $this->urlList(); + if ($this->isChanged('UrlProcessor') && $urlList->hasCrawled()) { + if ($processorClass = $this->UrlProcessor) { + $urlList->setUrlProcessor(new $processorClass); + } else { + $urlList->setUrlProcessor(null); + } + $urlList->reprocessUrls(); + } + } + + + public function urlList() + { + if (!$this->urlList) { + $this->urlList = new StaticSiteUrlList($this->BaseUrl, "../assets/static-site-" . $this->ID); + if ($processorClass = $this->UrlProcessor) { + $this->urlList->setUrlProcessor(new $processorClass); + } + if ($this->ExtraCrawlUrls) { + $extraCrawlUrls = preg_split('/\s+/', trim($this->ExtraCrawlUrls)); + $this->urlList->setExtraCrawlUrls($extraCrawlUrls); + } + if ($this->UrlExcludePatterns) { + $urlExcludePatterns = preg_split('/\s+/', trim($this->UrlExcludePatterns)); + $this->urlList->setExcludePatterns($urlExcludePatterns); + } + } + return $this->urlList; + } + + /** + * Crawl the target site + * @return StaticSiteCrawler + */ + public function crawl($limit=false, $verbose=false) + { + if (!$this->BaseUrl) { + throw new LogicException("Can't crawl a site until Base URL is set."); + } + return $this->urlList()->crawl($limit, $verbose); + } + + public function getSchemaForURL($absoluteURL) + { + // TODO: Return the right schema + return $this->Schemas()->First(); + } + + /** + * Returns a StaticSiteContentItem for the given URL. + * Relative URLs are used as the unique identifiers by this importer + * + * @param $id The URL, relative to BaseURL, starting with "/". + * @return DataObject + */ + public function getObject($id) + { + if ($id[0] != "/") { + $id = $this->decodeId($id); + if ($id[0] != "/") { + throw new InvalidArgumentException("\$id must start with /"); + } + } + + return new StaticSiteContentItem($this, $id); + } + + public function getRoot() + { + return $this->getObject('/'); + } + + public function allowedImportTargets() + { + return array('sitetree' => true); + } + + /** + * Return the root node + * @return ArrayList A list containing the root node + */ + public function stageChildren($showAll = false) + { + if (!$this->urlList()->hasCrawled()) { + return new ArrayList; + } + + return new ArrayList(array( + $this->getObject("/") + )); + } + + public function getContentImporter($target=null) + { + return new StaticSiteImporter(); + } + + public function isValid() + { + if (!(boolean)$this->BaseUrl) { + return false; + } + return true; + } + public function canImport($member = null) + { + return $this->isValid(); + } + public function canCreate($member = null) + { + return true; + } } /** * A collection of ImportRules that apply to some or all of the pages being imported. */ -class StaticSiteContentSource_ImportSchema extends DataObject { - public static $db = array( - "DataType" => "Varchar", // classname - "Order" => "Int", - "AppliesTo" => "Varchar(255)", // regex - ); - public static $summary_fields = array( - "AppliesTo", - "DataType", - "Order", - ); - public static $field_labels = array( - "AppliesTo" => "URLs applied to", - "DataType" => "Data type", - "Order" => "Priority", - ); - - public static $default_sort = "Order"; - - public static $has_one = array( - "ContentSource" => "StaticSiteContentSource", - ); - - public static $has_many = array( - "ImportRules" => "StaticSiteContentSource_ImportRule", - ); - - public function getTitle() { - return $this->DataType.' ('.$this->AppliesTo.')'; - } - - /** - * - * @return FieldList - */ - public function getCMSFields() { - $fields = parent::getCMSFields(); - $fields->removeFieldFromTab('Root.Main', 'DataType'); - $fields->removeByName('ContentSourceID'); - $dataObjects = ClassInfo::subclassesFor('DataObject'); - array_shift($dataObjects); - natcasesort($dataObjects); - $fields->addFieldToTab('Root.Main', new DropdownField('DataType', 'DataType', $dataObjects)); - - $importRules = $fields->dataFieldByName('ImportRules'); - if($importRules) { - $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); - $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); - $addNewButton = new GridFieldAddNewButton('after'); - $addNewButton->setButtonName("Add Rule"); - $importRules->getConfig()->addComponent($addNewButton); - - $fields->removeFieldFromTab('Root', 'ImportRules'); - $fields->addFieldToTab('Root.Main', $importRules); - } - - return $fields; - } - - public function requireDefaultRecords() { - foreach(StaticSiteContentSource::get() as $source) { - if(!$source->Schemas()->count()) { - Debug::message("Making a schema for $source->ID"); - $defaultSchema = new StaticSiteContentSource_ImportSchema; - $defaultSchema->Order = 1000000; - $defaultSchema->AppliesTo = ".*"; - $defaultSchema->DataType = "Page"; - $defaultSchema->ContentSourceID = $source->ID; - $defaultSchema->write(); - - - foreach(StaticSiteContentSource_ImportRule::get()->filter(array('SchemaID' => 0)) as $rule) { - $rule->SchemaID = $defaultSchema->ID; - $rule->write(); - } - } - } - } - - /** - * Return the import rules in a format suitable for configuring StaticSiteContentExtractor. - * - * @return array A map of field name => array(CSS selector, CSS selector, ...) - */ - public function getImportRules() { - $output = array(); - - foreach($this->ImportRules() as $rule) { - if(!isset($output[$rule->FieldName])) $output[$rule->FieldName] = array(); - $ruleArray = array( - 'selector' => $rule->CSSSelector, - 'attribute' => $rule->Attribute, - 'plaintext' => $rule->PlainText, - 'excludeselectors' => preg_split('/\s+/', trim($rule->ExcludeCSSSelector)), - 'outerhtml' => $rule->OuterHTML, - ); - $output[$rule->FieldName][] = $ruleArray; - } - - return $output; - } - +class StaticSiteContentSource_ImportSchema extends DataObject +{ + public static $db = array( + "DataType" => "Varchar", // classname + "Order" => "Int", + "AppliesTo" => "Varchar(255)", // regex + ); + public static $summary_fields = array( + "AppliesTo", + "DataType", + "Order", + ); + public static $field_labels = array( + "AppliesTo" => "URLs applied to", + "DataType" => "Data type", + "Order" => "Priority", + ); + + public static $default_sort = "Order"; + + public static $has_one = array( + "ContentSource" => "StaticSiteContentSource", + ); + + public static $has_many = array( + "ImportRules" => "StaticSiteContentSource_ImportRule", + ); + + public function getTitle() + { + return $this->DataType.' ('.$this->AppliesTo.')'; + } + + /** + * + * @return FieldList + */ + public function getCMSFields() + { + $fields = parent::getCMSFields(); + $fields->removeFieldFromTab('Root.Main', 'DataType'); + $fields->removeByName('ContentSourceID'); + $dataObjects = ClassInfo::subclassesFor('DataObject'); + array_shift($dataObjects); + natcasesort($dataObjects); + $fields->addFieldToTab('Root.Main', new DropdownField('DataType', 'DataType', $dataObjects)); + + $importRules = $fields->dataFieldByName('ImportRules'); + if ($importRules) { + $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); + $addNewButton = new GridFieldAddNewButton('after'); + $addNewButton->setButtonName("Add Rule"); + $importRules->getConfig()->addComponent($addNewButton); + + $fields->removeFieldFromTab('Root', 'ImportRules'); + $fields->addFieldToTab('Root.Main', $importRules); + } + + return $fields; + } + + public function requireDefaultRecords() + { + foreach (StaticSiteContentSource::get() as $source) { + if (!$source->Schemas()->count()) { + Debug::message("Making a schema for $source->ID"); + $defaultSchema = new StaticSiteContentSource_ImportSchema; + $defaultSchema->Order = 1000000; + $defaultSchema->AppliesTo = ".*"; + $defaultSchema->DataType = "Page"; + $defaultSchema->ContentSourceID = $source->ID; + $defaultSchema->write(); + + + foreach (StaticSiteContentSource_ImportRule::get()->filter(array('SchemaID' => 0)) as $rule) { + $rule->SchemaID = $defaultSchema->ID; + $rule->write(); + } + } + } + } + + /** + * Return the import rules in a format suitable for configuring StaticSiteContentExtractor. + * + * @return array A map of field name => array(CSS selector, CSS selector, ...) + */ + public function getImportRules() + { + $output = array(); + + foreach ($this->ImportRules() as $rule) { + if (!isset($output[$rule->FieldName])) { + $output[$rule->FieldName] = array(); + } + $ruleArray = array( + 'selector' => $rule->CSSSelector, + 'attribute' => $rule->Attribute, + 'plaintext' => $rule->PlainText, + 'excludeselectors' => preg_split('/\s+/', trim($rule->ExcludeCSSSelector)), + 'outerhtml' => $rule->OuterHTML, + ); + $output[$rule->FieldName][] = $ruleArray; + } + + return $output; + } } /** * A single import rule that forms part of an ImportSchema */ -class StaticSiteContentSource_ImportRule extends DataObject { - public static $db = array( - "FieldName" => "Varchar", - "CSSSelector" => "Text", - "ExcludeCSSSelector" => "Text", - "Attribute" => "Varchar", - "PlainText" => "Boolean", - "OuterHTML" => "Boolean", - ); - - public static $summary_fields = array( - "FieldName", - "CSSSelector", - "Attribute", - "PlainText", - "OuterHTML", - ); - - public static $field_labels = array( - "FieldName" => "Field Name", - "CSSSelector" => "CSS Selector", - "Attribute" => "Element attribute", - "PlainText" => "Convert to plain text", - "OuterHTML" => "Use the outer HTML", - ); - - public static $has_one = array( - "Schema" => "StaticSiteContentSource_ImportSchema", - ); - - public function getTitle() { - return ($this->FieldName)?$this->FieldName:$this->ID; - } - - /** - * - * @return FieldList - */ - public function getCMSFields() { - $fields = parent::getCMSFields(); - - $dataType = $this->Schema()->DataType; - if($dataType) { - $fieldList = singleton($dataType)->inheritedDatabaseFields(); - $fieldList = array_combine(array_keys($fieldList),array_keys($fieldList)); - unset($fieldList->ParentID); - unset($fieldList->WorkflowDefinitionID); - unset($fieldList->Version); - - $fieldNameField = new DropdownField("FieldName", "Field Name", $fieldList); - $fieldNameField->setEmptyString("(choose)"); - $fields->insertBefore($fieldNameField, "CSSSelector"); - } else { - $fields->replaceField('FieldName', $fieldName = new ReadonlyField("FieldName", "Field Name")); - $fieldName->setDescription('Save this rule before being able to add a field name'); - } - - return $fields; - } -} \ No newline at end of file +class StaticSiteContentSource_ImportRule extends DataObject +{ + public static $db = array( + "FieldName" => "Varchar", + "CSSSelector" => "Text", + "ExcludeCSSSelector" => "Text", + "Attribute" => "Varchar", + "PlainText" => "Boolean", + "OuterHTML" => "Boolean", + ); + + public static $summary_fields = array( + "FieldName", + "CSSSelector", + "Attribute", + "PlainText", + "OuterHTML", + ); + + public static $field_labels = array( + "FieldName" => "Field Name", + "CSSSelector" => "CSS Selector", + "Attribute" => "Element attribute", + "PlainText" => "Convert to plain text", + "OuterHTML" => "Use the outer HTML", + ); + + public static $has_one = array( + "Schema" => "StaticSiteContentSource_ImportSchema", + ); + + public function getTitle() + { + return ($this->FieldName)?$this->FieldName:$this->ID; + } + + /** + * + * @return FieldList + */ + public function getCMSFields() + { + $fields = parent::getCMSFields(); + + $dataType = $this->Schema()->DataType; + if ($dataType) { + $fieldList = singleton($dataType)->inheritedDatabaseFields(); + $fieldList = array_combine(array_keys($fieldList), array_keys($fieldList)); + unset($fieldList->ParentID); + unset($fieldList->WorkflowDefinitionID); + unset($fieldList->Version); + + $fieldNameField = new DropdownField("FieldName", "Field Name", $fieldList); + $fieldNameField->setEmptyString("(choose)"); + $fields->insertBefore($fieldNameField, "CSSSelector"); + } else { + $fields->replaceField('FieldName', $fieldName = new ReadonlyField("FieldName", "Field Name")); + $fieldName->setDescription('Save this rule before being able to add a field name'); + } + + return $fields; + } +} diff --git a/code/StaticSiteDataExtension.php b/code/StaticSiteDataExtension.php index effd33e..8f78b59 100644 --- a/code/StaticSiteDataExtension.php +++ b/code/StaticSiteDataExtension.php @@ -1,16 +1,18 @@ "StaticSiteContentSource", - ); - static $db = array( - "StaticSiteURL" => "Varchar(255)", - ); +class StaticSiteDataExtension extends DataExtension +{ + public static $has_one = array( + "StaticSiteContentSource" => "StaticSiteContentSource", + ); + public static $db = array( + "StaticSiteURL" => "Varchar(255)", + ); - function updateCMSFields(FieldList $fields) { - if($this->owner->StaticSiteContentSourceID && $this->owner->StaticSiteURL) { - $fields->addFieldToTab('Root.Main', new ReadonlyField('StaticSiteURL', 'Imported URL'), 'MenuTitle'); - } - } -} \ No newline at end of file + public function updateCMSFields(FieldList $fields) + { + if ($this->owner->StaticSiteContentSourceID && $this->owner->StaticSiteURL) { + $fields->addFieldToTab('Root.Main', new ReadonlyField('StaticSiteURL', 'Imported URL'), 'MenuTitle'); + } + } +} diff --git a/code/StaticSiteExternalContentAdminExtension.php b/code/StaticSiteExternalContentAdminExtension.php index b147aa4..891d7c8 100644 --- a/code/StaticSiteExternalContentAdminExtension.php +++ b/code/StaticSiteExternalContentAdminExtension.php @@ -1,35 +1,37 @@ getSource(); + public function crawlsite($request) + { + $selected = isset($request['ID']) ? $request['ID'] : 0; + if (!$selected) { + $messageType = 'bad'; + $message = _t('ExternalContent.NOITEMSELECTED', 'No item selected to crawl.'); + } else { + $source = ExternalContent::getDataObjectFor($selected); + if (!($source instanceof ExternalContentSource)) { + $source = $from->getSource(); + } - $messageType = 'good'; - $message = _t('ExternalContent.CONTENTMIGRATED', 'Crawling successful.'); + $messageType = 'good'; + $message = _t('ExternalContent.CONTENTMIGRATED', 'Crawling successful.'); - try { - $source->crawl(); - } catch(Exception $e) { - $messageType = 'bad'; - $message = "Error crawling: " . $e->getMessage(); - } + try { + $source->crawl(); + } catch (Exception $e) { + $messageType = 'bad'; + $message = "Error crawling: " . $e->getMessage(); + } + } - } + Session::set("FormInfo.Form_EditForm.formError.message", $message); + Session::set("FormInfo.Form_EditForm.formError.type", $messageType); - Session::set("FormInfo.Form_EditForm.formError.message", $message); - Session::set("FormInfo.Form_EditForm.formError.type", $messageType); - - return $this->owner->getResponseNegotiator()->respond($this->owner->getRequest()); - } -} \ No newline at end of file + return $this->owner->getResponseNegotiator()->respond($this->owner->getRequest()); + } +} diff --git a/code/StaticSiteImporter.php b/code/StaticSiteImporter.php index 7ba5493..ac9d2c8 100644 --- a/code/StaticSiteImporter.php +++ b/code/StaticSiteImporter.php @@ -1,12 +1,14 @@ contentTransforms['sitetree'] = new StaticSitePageTransformer(); - } +class StaticSiteImporter extends ExternalContentImporter +{ + public function __construct() + { + $this->contentTransforms['sitetree'] = new StaticSitePageTransformer(); + } - public function getExternalType($item) { - return "sitetree"; - } - -} \ No newline at end of file + public function getExternalType($item) + { + return "sitetree"; + } +} diff --git a/code/StaticSiteLinkRewriter.php b/code/StaticSiteLinkRewriter.php index c2acb2e..4e88254 100644 --- a/code/StaticSiteLinkRewriter.php +++ b/code/StaticSiteLinkRewriter.php @@ -5,65 +5,70 @@ /** * Helper class for rewriting links using phpQuery. */ -class StaticSiteLinkRewriter { +class StaticSiteLinkRewriter +{ - protected $tagMap = array( - 'a' => array('href'), - 'img' => array('src'), - ); + protected $tagMap = array( + 'a' => array('href'), + 'img' => array('src'), + ); - protected $callback; + protected $callback; - function __construct($callback) { - $this->callback = $callback; - } + public function __construct($callback) + { + $this->callback = $callback; + } - /** - * Set a map of tags & attributes to search for URls. - * - * Each key is a tagname, and each value is an array of attribute names. - */ - function setTagMap($tagMap) { - $this->tagMap = $tagMap; - } + /** + * Set a map of tags & attributes to search for URls. + * + * Each key is a tagname, and each value is an array of attribute names. + */ + public function setTagMap($tagMap) + { + $this->tagMap = $tagMap; + } - /** - * Return the tagmap - */ - function getTagMap($tagMap) { - $this->tagMap = $tagMap; - } + /** + * Return the tagmap + */ + public function getTagMap($tagMap) + { + $this->tagMap = $tagMap; + } - /** - * Rewrite URLs in a PHPQuery object. The content of the object will be modified. - * - * @param phpQuery $pq The content containing the links to rewrite - */ - function rewriteInPQ($pq) { - $callback = $this->callback; + /** + * Rewrite URLs in a PHPQuery object. The content of the object will be modified. + * + * @param phpQuery $pq The content containing the links to rewrite + */ + public function rewriteInPQ($pq) + { + $callback = $this->callback; - // Make URLs absolute - foreach($this->tagMap as $tag => $attributes) { - foreach($pq[$tag] as $tagObj) { - foreach($attributes as $attribute) { - if($url = pq($tagObj)->attr($attribute)) { - $newURL = $callback($url); - pq($tagObj)->attr($attribute, $newURL); - } - } - } - } - } + // Make URLs absolute + foreach ($this->tagMap as $tag => $attributes) { + foreach ($pq[$tag] as $tagObj) { + foreach ($attributes as $attribute) { + if ($url = pq($tagObj)->attr($attribute)) { + $newURL = $callback($url); + pq($tagObj)->attr($attribute, $newURL); + } + } + } + } + } - /** - * Rewrite URLs in the given content snippet. Returns the updated content. - * - * @param phpQuery $pq The content containing the links to rewrite - */ - function rewriteInContent($content) { - $pq = phpQuery::newDocument($content); - $this->rewriteInPQ($pq); - return $pq->html(); - } - -} \ No newline at end of file + /** + * Rewrite URLs in the given content snippet. Returns the updated content. + * + * @param phpQuery $pq The content containing the links to rewrite + */ + public function rewriteInContent($content) + { + $pq = phpQuery::newDocument($content); + $this->rewriteInPQ($pq); + return $pq->html(); + } +} diff --git a/code/StaticSitePageTransformer.php b/code/StaticSitePageTransformer.php index efc39bc..9494b1c 100644 --- a/code/StaticSitePageTransformer.php +++ b/code/StaticSitePageTransformer.php @@ -1,91 +1,94 @@ ID, $parentObject->Title"); - Debug::message($item->AbsoluteURL); - } - - // Sleep for 100ms to reduce load on the remote server - usleep(100*1000); - - // Extract content from the page - $contentFields = $this->getContentFieldsAndSelectors($item); - - // Default value for Title - if(empty($contentFields['Title'])) { - $contentFields['Title'] = array('content' => $item->Name); - } - - // Default value for URL segment - if(empty($contentFields['URLSegment'])) { - $urlSegment = str_replace('/','', $item->Name); - $urlSegment = preg_replace('/\.[^.]*$/','',$urlSegment); - $urlSegment = str_replace('.','-', $item->Name); - $contentFields['URLSegment'] = array('content' => $urlSegment); - } - - $schema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); - - $pageType = $schema->DataType; - - if(!$pageType) { - throw new Exception('Pagetype for migration schema is empty!'); - } - - // Create a page with the appropriate fields - $page = new $pageType(array()); - $existingPage = SiteTree::get_by_link($item->getExternalId()); - - if($existingPage && $duplicateStrategy === 'Overwrite') { - if(get_class($existingPage) !== $pageType) { - $existingPage->ClassName = $pageType; - $existingPage->write(); - } - if($existingPage) { - $page = $existingPage; - } - } - - $page->StaticSiteContentSourceID = $item->getSource()->ID; - $page->StaticSiteURL = $item->AbsoluteURL; - - $page->ParentID = $parentObject ? $parentObject->ID : 0; - - foreach($contentFields as $k => $v) { - $page->$k = $v['content']; - } - - $page->write(); - - if(Director::is_cli()) { - Debug::message("#$page->Title"); - Debug::message("#$page->ID child of #$page->ID"); - } - - return new TransformResult($page, $item->stageChildren()); - } - - /** - * Get content from the remote host - * - * @param StaticSiteeContentItem $item The item to extract - * @return array A map of field name => array('selector' => selector, 'content' => field content) - */ - public function getContentFieldsAndSelectors($item) { - // Get the import rules from the content source - $importSchema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); - if(!$importSchema) { - return null; - throw new LogicException("Couldn't find an import schema for $item->AbsoluteURL"); - } - $importRules = $importSchema->getImportRules(); - - // Extract from the remote page based on those rules - $contentExtractor = new StaticSiteContentExtractor($item->AbsoluteURL); - - return $contentExtractor->extractMapAndSelectors($importRules); - } -} \ No newline at end of file +class StaticSitePageTransformer implements ExternalContentTransformer +{ + + public function transform($item, $parentObject, $duplicateStrategy) + { + if (Director::is_cli()) { + Debug::message("Parent: #$parentObject->ID, $parentObject->Title"); + Debug::message($item->AbsoluteURL); + } + + // Sleep for 100ms to reduce load on the remote server + usleep(100*1000); + + // Extract content from the page + $contentFields = $this->getContentFieldsAndSelectors($item); + + // Default value for Title + if (empty($contentFields['Title'])) { + $contentFields['Title'] = array('content' => $item->Name); + } + + // Default value for URL segment + if (empty($contentFields['URLSegment'])) { + $urlSegment = str_replace('/', '', $item->Name); + $urlSegment = preg_replace('/\.[^.]*$/', '', $urlSegment); + $urlSegment = str_replace('.', '-', $item->Name); + $contentFields['URLSegment'] = array('content' => $urlSegment); + } + + $schema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); + + $pageType = $schema->DataType; + + if (!$pageType) { + throw new Exception('Pagetype for migration schema is empty!'); + } + + // Create a page with the appropriate fields + $page = new $pageType(array()); + $existingPage = SiteTree::get_by_link($item->getExternalId()); + + if ($existingPage && $duplicateStrategy === 'Overwrite') { + if (get_class($existingPage) !== $pageType) { + $existingPage->ClassName = $pageType; + $existingPage->write(); + } + if ($existingPage) { + $page = $existingPage; + } + } + + $page->StaticSiteContentSourceID = $item->getSource()->ID; + $page->StaticSiteURL = $item->AbsoluteURL; + + $page->ParentID = $parentObject ? $parentObject->ID : 0; + + foreach ($contentFields as $k => $v) { + $page->$k = $v['content']; + } + + $page->write(); + + if (Director::is_cli()) { + Debug::message("#$page->Title"); + Debug::message("#$page->ID child of #$page->ID"); + } + + return new TransformResult($page, $item->stageChildren()); + } + + /** + * Get content from the remote host + * + * @param StaticSiteeContentItem $item The item to extract + * @return array A map of field name => array('selector' => selector, 'content' => field content) + */ + public function getContentFieldsAndSelectors($item) + { + // Get the import rules from the content source + $importSchema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); + if (!$importSchema) { + return null; + throw new LogicException("Couldn't find an import schema for $item->AbsoluteURL"); + } + $importRules = $importSchema->getImportRules(); + + // Extract from the remote page based on those rules + $contentExtractor = new StaticSiteContentExtractor($item->AbsoluteURL); + + return $contentExtractor->extractMapAndSelectors($importRules); + } +} diff --git a/code/StaticSiteUrlList.php b/code/StaticSiteUrlList.php index 69c0780..6e40fff 100644 --- a/code/StaticSiteUrlList.php +++ b/code/StaticSiteUrlList.php @@ -7,532 +7,606 @@ * * Makes use of PHPCrawl to prepare a list of URLs on the site */ -class StaticSiteUrlList { - protected $baseURL, $cacheDir; - - /** - * Two element array: contains keys 'inferred' and 'regular': - * - 'regular' is an array mapping raw URLs to processed URLs - * - 'inferred' is an array of inferred URLs - */ - protected $urls = null; - - protected $autoCrawl = false; - - protected $urlProcessor = null; - - protected $extraCrawlURLs = null; - - /** - * A list of regular expression patterns to exclude from scraping - * - * @var array - */ - protected $excludePatterns = array(); - - /** - * Create a new URL List - * @param string $baseURL The Base URL to find links on - * @param string $cacheDir The local path to cache data into - */ - function __construct($baseURL, $cacheDir) { - // baseURL mus not have a trailing slash - if(substr($baseURL,-1) == "/") $baseURL = substr($baseURL,0,-1); - // cacheDir must have a trailing slash - if(substr($cacheDir,-1) != "/") $cacheDir .= "/"; - - $this->baseURL = $baseURL; - $this->cacheDir = $cacheDir; - } - - /** - * Set a URL processor for this URL List. - * - * URL processors process the URLs before the site heirarchy and inferred meta-data are generated. - * These can be used to tranform URLs from CMSes that don't provide a natural heirarchy into something - * more useful. - * - * See {@link StaticSiteMOSSURLProcessor} for an example. - * - * @param StaticSiteUrlProcessor $urlProcessor [description] - */ - function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor) { - $this->urlProcessor = $urlProcessor; - } - - /** - * Define additional crawl URLs as an array - * Each of these URLs will be crawled in addition the base URL. - * This can be helpful if pages are getting missed by the crawl - */ - function setExtraCrawlURls($extraCrawlURLs) { - $this->extraCrawlURLs = $extraCrawlURLs; - } - - /** - * Return the additional crawl URLs as an array - */ - function getExtraCrawlURLs() { - return $this->extraCrawlURLs; - } - - /** - * Set an array of regular expression patterns that should be excluded from - * being added to the url list - * - * @param array $excludePatterns - */ - public function setExcludePatterns(array $excludePatterns) { - $this->excludePatterns = $excludePatterns; - } - - /** - * Get an array of regular expression patterns that should not be added to - * the url list - * - * @return array - */ - public function getExcludePatterns() { - return $this->excludePatterns; - } - - /** - * - * Set whether the crawl should be triggered on demand. - * @param [type] $autoCrawl [description] - */ - public function setAutoCrawl($autoCrawl) { - $this->autoCrawl = $autoCrawl; - } - - /** - * Returns the status of the spidering: "Complete", "Partial", or "Not started" - * @return [type] [description] - */ - public function getSpiderStatus() { - if(file_exists($this->cacheDir . 'urls')) { - if(file_exists($this->cacheDir . 'crawlerid')) return "Partial"; - else return "Complete"; - - } else { - return "Not started"; - } - } - - /** - * Return the number of URLs crawled so far - */ - public function getNumURLs() { - if($this->urls) { - $urls = $this->urls; - // Don't rely on loadUrls() as it chokes on partially completed imports - } else if(file_exists($this->cacheDir . 'urls')) { - $urls = unserialize(file_get_contents($this->cacheDir . 'urls')); - } else { - return null; - } - - return sizeof(array_unique($urls['regular'])) + sizeof($urls['inferred']); - } - - /** - * Return the raw URLs as an array - * @return array - */ - public function getRawURLs() { - if($urls = $this->getProcessedURLs()) { - return array_keys($urls); - } - } - - /** - * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values - * @return array - */ - public function getProcessedURLs() { - if($this->hasCrawled() || $this->autoCrawl) { - if($this->urls === null) $this->loadUrls(); - return array_merge( - $this->urls['regular'], - $this->urls['inferred'] ? array_combine($this->urls['inferred'], $this->urls['inferred']) : array() - ); - } - } - - public function hasCrawled() { - // There are URLs and we're not in the middle of a crawl - return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid'); - } - - /** - * Load the URLs, either by crawling, or by fetching from cache - * @return void - */ - public function loadUrls() { - if($this->hasCrawled()) { - $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); - // Clear out obsolete format - if(!isset($this->urls['regular']) || !isset($this->urls['inferred'])) { - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - } else if($this->autoCrawl) { - $this->crawl(); - - } else { - throw new LogicException("Crawl hasn't been executed yet, and autoCrawl is set to false"); - } - } - - /** - * Re-execute the URL processor on all the fetched URLs - * @return void - */ - public function reprocessUrls() { - if($this->urls === null) $this->loadUrls(); - - // Clear out all inferred URLs; these will be added - $this->urls['inferred'] = array(); - - // Reprocess URLs, in case the processing has changed since the last crawl - foreach($this->urls['regular'] as $url => $oldProcessed) { - $processedURL = $this->generateProcessedURL($url); - $this->urls['regular'][$url] = $processedURL; - - // Trigger parent URL back-filling on new processed URL - $this->parentProcessedURL($processedURL); - } - - $this->saveURLs(); - } - - /** - * - * @param int $limit - * @param bool $verbose - * @return \StaticSiteCrawler - */ - public function crawl($limit=false, $verbose=false) { - increase_time_limit_to(3600); - - if(!is_dir($this->cacheDir)) mkdir($this->cacheDir); - - $crawler = new StaticSiteCrawler($this, $limit, $verbose); - $crawler->enableResumption(); - $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE); - $crawler->setWorkingDirectory($this->cacheDir); - - // Allow for resuming an incomplete crawl - if(file_exists($this->cacheDir.'crawlerid')) { - // We should re-load the partial list of URLs, if relevant - // This should only happen when we are resuming a partial crawl - if(file_exists($this->cacheDir . 'urls')) { - $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); - } else { - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - $crawlerID = file_get_contents($this->cacheDir.'crawlerid'); - $crawler->resume($crawlerID); - } else { - $crawlerID = $crawler->getCrawlerId(); - file_put_contents($this->cacheDir.'/crawlerid', $crawlerID); - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - $crawler->setURL($this->baseURL); - $crawler->go(); - - unlink($this->cacheDir.'crawlerid'); - - ksort($this->urls['regular']); - ksort($this->urls['inferred']); - $this->saveURLs(); - return $crawler; - } - - /** - * Save the current list of URLs to disk - * @return [type] [description] - */ - function saveURLs() { - file_put_contents($this->cacheDir . 'urls', serialize($this->urls)); - } - - /** - * Add a URL to this list, given the absolute URL - * @param string $url The absolute URL - */ - function addAbsoluteURL($url) { - $simpifiedURL = $this->simplifyURL($url); - $simpifiedBase = $this->simplifyURL($this->baseURL); - - if(substr($simpifiedURL,0,strlen($simpifiedBase)) == $simpifiedBase) { - $relURL = substr($url, strlen($this->baseURL)); - } else { - throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); - } - - return $this->addURL($relURL); - } - - function addURL($url) { - if($this->urls === null) $this->loadUrls(); - - // Generate and save the processed URLs - $this->urls['regular'][$url] = $this->generateProcessedURL($url); - - // Trigger parent URL back-filling - $this->parentProcessedURL($this->urls['regular'][$url]); - } - - - /** - * Add an inferred URL to the list. - * - * Since the unprocessed URL isn't available, we use the processed URL in its place. This should be used with - * some caution. - * - * @param string $processedURL The processed URL to add. - */ - function addInferredURL($inferredURL) { - if($this->urls === null) $this->loadUrls(); - - // Generate and save the processed URLs - $this->urls['inferred'][$inferredURL] = $inferredURL; - - // Trigger parent URL back-filling - $this->parentProcessedURL($inferredURL); - } - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - /** - * Return true if the given URL exists - * @param string $url The URL, either absolute, or relative starting with "/" - * @return boolean Does the URL exist - */ - function hasURL($url) { - if($this->urls === null) $this->loadUrls(); - - // Try and relativise an absolute URL - if($url[0] != '/') { - $simpifiedURL = $this->simplifyURL($url); - $simpifiedBase = $this->simplifyURL($this->baseURL); - - if(substr($simpifiedURL,0,strlen($simpifiedBase)) == $simpifiedBase) { - $url = substr($simpifiedURL, strlen($simpifiedBase)); - } else { - throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); - } - } - - return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); - } - - /** - * Simplify a URL. - * Ignores https/http differences and "www." / non differences. - * - * @param string $url - * @return string - */ - protected function simplifyURL($url) { - return preg_replace('#^https?://(www\.)?#i','http://www.', $url); - } - - /** - * Returns true if the given URL is in the list of processed URls - * - * @param string $processedURL The processed URL - * @return boolean True if it exists, false otherwise - */ - function hasProcessedURL($processedURL) { - if($this->urls === null) $this->loadUrls(); - - return in_array($processedURL, $this->urls['regular']) || in_array($processedURL, $this->urls['inferred']); - - } - - /** - * Return the processed URL that is the parent of the given one. - * - * Both input and output are processed URLs - * - * @param string $url A relative URL - * @return string [description] - */ - function parentProcessedURL($processedURL) { - if($processedURL == "/") return ""; - - // URL heirachy can be broken down by querystring or by URL - $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL,'/')); - - // Special case for children of the root - if($breakpoint == 0) return "/"; - - // Get parent URL - $parentProcessedURL = substr($processedURL,0,$breakpoint); - - // If an intermediary URL doesn't exist, create it - if(!$this->hasProcessedURL($parentProcessedURL)) $this->addInferredURL($parentProcessedURL); - - return $parentProcessedURL; - } - - /** - * Return the regular URL, given the processed one. - * - * Note that the URL processing isn't reversible, so this function works looks by iterating through all URLs. - * If the URL doesn't exist in the list, this function returns null. - * - * @param string $processedURL The URL after processing has been applied. - * @return string The original URL. - */ - function unprocessedURL($processedURL) { - if($url = array_search($processedURL, $this->urls['regular'])) { - return $url; - - } else if(in_array($processedURL, $this->urls['inferred'])) { - return $processedURL; - } else { - return null; - } - } - - /** - * Find the processed URL in the URL list - * @param [type] $url [description] - * @return [type] [description] - */ - function processedURL($url) { - if($this->urls === null) $this->loadUrls(); - - if(isset($this->urls['regular'][$url])) { - // Generate it if missing - if($this->urls['regular'][$url] === true) $this->urls['regular'][$url] = $this->generateProcessedURL($url); - return $this->urls['regular'][$url]; - - } elseif(in_array($url, $this->urls['inferred'])) { - return $url; - } - } - - /** - * Execute custom logic for processing URLs prior to heirachy generation. - * - * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. - * - * @param string $url The unprocessed URL - * @return string The processed URL - */ - function generateProcessedURL($url) { - if(!$url) throw new LogicException("Can't pass a blank URL to generateProcessedURL"); - if($this->urlProcessor) $url = $this->urlProcessor->processURL($url); - if(!$url) throw new LogicException(get_class($this->urlProcessor) . " returned a blank URL."); - return $url; - } - - /** - * Return the URLs that are a child of the given URL - * @param [type] $url [description] - * @return [type] [description] - */ - function getChildren($url) { - if($this->urls === null) $this->loadUrls(); - - $processedURL = $this->processedURL($url); - - // Subtly different regex if the URL ends in ? or / - if(preg_match('#[/?]$#',$processedURL)) $regEx = '#^'.preg_quote($processedURL,'#') . '[^/?]+$#'; - else $regEx = '#^'.preg_quote($processedURL,'#') . '[/?][^/?]+$#'; - - $children = array(); - foreach($this->urls['regular'] as $potentialChild => $potentialProcessedChild) { - if(preg_match($regEx, $potentialProcessedChild)) { - if(!isset($children[$potentialProcessedChild])) { - $children[$potentialProcessedChild] = $potentialChild; - } - } - } - foreach($this->urls['inferred'] as $potentialProcessedChild) { - if(preg_match($regEx, $potentialProcessedChild)) { - if(!isset($children[$potentialProcessedChild])) { - $children[$potentialProcessedChild] = $potentialProcessedChild; - } - } - } - - return array_values($children); - } +class StaticSiteUrlList +{ + protected $baseURL, $cacheDir; + + /** + * Two element array: contains keys 'inferred' and 'regular': + * - 'regular' is an array mapping raw URLs to processed URLs + * - 'inferred' is an array of inferred URLs + */ + protected $urls = null; + + protected $autoCrawl = false; + + protected $urlProcessor = null; + + protected $extraCrawlURLs = null; + + /** + * A list of regular expression patterns to exclude from scraping + * + * @var array + */ + protected $excludePatterns = array(); + + /** + * Create a new URL List + * @param string $baseURL The Base URL to find links on + * @param string $cacheDir The local path to cache data into + */ + public function __construct($baseURL, $cacheDir) + { + // baseURL mus not have a trailing slash + if (substr($baseURL, -1) == "/") { + $baseURL = substr($baseURL, 0, -1); + } + // cacheDir must have a trailing slash + if (substr($cacheDir, -1) != "/") { + $cacheDir .= "/"; + } + + $this->baseURL = $baseURL; + $this->cacheDir = $cacheDir; + } + + /** + * Set a URL processor for this URL List. + * + * URL processors process the URLs before the site heirarchy and inferred meta-data are generated. + * These can be used to tranform URLs from CMSes that don't provide a natural heirarchy into something + * more useful. + * + * See {@link StaticSiteMOSSURLProcessor} for an example. + * + * @param StaticSiteUrlProcessor $urlProcessor [description] + */ + public function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor) + { + $this->urlProcessor = $urlProcessor; + } + + /** + * Define additional crawl URLs as an array + * Each of these URLs will be crawled in addition the base URL. + * This can be helpful if pages are getting missed by the crawl + */ + public function setExtraCrawlURls($extraCrawlURLs) + { + $this->extraCrawlURLs = $extraCrawlURLs; + } + + /** + * Return the additional crawl URLs as an array + */ + public function getExtraCrawlURLs() + { + return $this->extraCrawlURLs; + } + + /** + * Set an array of regular expression patterns that should be excluded from + * being added to the url list + * + * @param array $excludePatterns + */ + public function setExcludePatterns(array $excludePatterns) + { + $this->excludePatterns = $excludePatterns; + } + + /** + * Get an array of regular expression patterns that should not be added to + * the url list + * + * @return array + */ + public function getExcludePatterns() + { + return $this->excludePatterns; + } + + /** + * + * Set whether the crawl should be triggered on demand. + * @param [type] $autoCrawl [description] + */ + public function setAutoCrawl($autoCrawl) + { + $this->autoCrawl = $autoCrawl; + } + + /** + * Returns the status of the spidering: "Complete", "Partial", or "Not started" + * @return [type] [description] + */ + public function getSpiderStatus() + { + if (file_exists($this->cacheDir . 'urls')) { + if (file_exists($this->cacheDir . 'crawlerid')) { + return "Partial"; + } else { + return "Complete"; + } + } else { + return "Not started"; + } + } + + /** + * Return the number of URLs crawled so far + */ + public function getNumURLs() + { + if ($this->urls) { + $urls = $this->urls; + // Don't rely on loadUrls() as it chokes on partially completed imports + } elseif (file_exists($this->cacheDir . 'urls')) { + $urls = unserialize(file_get_contents($this->cacheDir . 'urls')); + } else { + return null; + } + + return sizeof(array_unique($urls['regular'])) + sizeof($urls['inferred']); + } + + /** + * Return the raw URLs as an array + * @return array + */ + public function getRawURLs() + { + if ($urls = $this->getProcessedURLs()) { + return array_keys($urls); + } + } + + /** + * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values + * @return array + */ + public function getProcessedURLs() + { + if ($this->hasCrawled() || $this->autoCrawl) { + if ($this->urls === null) { + $this->loadUrls(); + } + return array_merge( + $this->urls['regular'], + $this->urls['inferred'] ? array_combine($this->urls['inferred'], $this->urls['inferred']) : array() + ); + } + } + + public function hasCrawled() + { + // There are URLs and we're not in the middle of a crawl + return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid'); + } + + /** + * Load the URLs, either by crawling, or by fetching from cache + * @return void + */ + public function loadUrls() + { + if ($this->hasCrawled()) { + $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); + // Clear out obsolete format + if (!isset($this->urls['regular']) || !isset($this->urls['inferred'])) { + $this->urls = array('regular' => array(), 'inferred' => array()); + } + } elseif ($this->autoCrawl) { + $this->crawl(); + } else { + throw new LogicException("Crawl hasn't been executed yet, and autoCrawl is set to false"); + } + } + + /** + * Re-execute the URL processor on all the fetched URLs + * @return void + */ + public function reprocessUrls() + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Clear out all inferred URLs; these will be added + $this->urls['inferred'] = array(); + + // Reprocess URLs, in case the processing has changed since the last crawl + foreach ($this->urls['regular'] as $url => $oldProcessed) { + $processedURL = $this->generateProcessedURL($url); + $this->urls['regular'][$url] = $processedURL; + + // Trigger parent URL back-filling on new processed URL + $this->parentProcessedURL($processedURL); + } + + $this->saveURLs(); + } + + /** + * + * @param int $limit + * @param bool $verbose + * @return \StaticSiteCrawler + */ + public function crawl($limit=false, $verbose=false) + { + increase_time_limit_to(3600); + + if (!is_dir($this->cacheDir)) { + mkdir($this->cacheDir); + } + + $crawler = new StaticSiteCrawler($this, $limit, $verbose); + $crawler->enableResumption(); + $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE); + $crawler->setWorkingDirectory($this->cacheDir); + + // Allow for resuming an incomplete crawl + if (file_exists($this->cacheDir.'crawlerid')) { + // We should re-load the partial list of URLs, if relevant + // This should only happen when we are resuming a partial crawl + if (file_exists($this->cacheDir . 'urls')) { + $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); + } else { + $this->urls = array('regular' => array(), 'inferred' => array()); + } + + $crawlerID = file_get_contents($this->cacheDir.'crawlerid'); + $crawler->resume($crawlerID); + } else { + $crawlerID = $crawler->getCrawlerId(); + file_put_contents($this->cacheDir.'/crawlerid', $crawlerID); + $this->urls = array('regular' => array(), 'inferred' => array()); + } + + $crawler->setURL($this->baseURL); + $crawler->go(); + + unlink($this->cacheDir.'crawlerid'); + + ksort($this->urls['regular']); + ksort($this->urls['inferred']); + $this->saveURLs(); + return $crawler; + } + + /** + * Save the current list of URLs to disk + * @return [type] [description] + */ + public function saveURLs() + { + file_put_contents($this->cacheDir . 'urls', serialize($this->urls)); + } + /** + * Add a URL to this list, given the absolute URL + * @param string $url The absolute URL + */ + public function addAbsoluteURL($url) + { + $simpifiedURL = $this->simplifyURL($url); + $simpifiedBase = $this->simplifyURL($this->baseURL); + + if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { + $relURL = substr($url, strlen($this->baseURL)); + } else { + throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); + } + + return $this->addURL($relURL); + } + + public function addURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Generate and save the processed URLs + $this->urls['regular'][$url] = $this->generateProcessedURL($url); + + // Trigger parent URL back-filling + $this->parentProcessedURL($this->urls['regular'][$url]); + } + + + /** + * Add an inferred URL to the list. + * + * Since the unprocessed URL isn't available, we use the processed URL in its place. This should be used with + * some caution. + * + * @param string $processedURL The processed URL to add. + */ + public function addInferredURL($inferredURL) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Generate and save the processed URLs + $this->urls['inferred'][$inferredURL] = $inferredURL; + + // Trigger parent URL back-filling + $this->parentProcessedURL($inferredURL); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Return true if the given URL exists + * @param string $url The URL, either absolute, or relative starting with "/" + * @return boolean Does the URL exist + */ + public function hasURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Try and relativise an absolute URL + if ($url[0] != '/') { + $simpifiedURL = $this->simplifyURL($url); + $simpifiedBase = $this->simplifyURL($this->baseURL); + + if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { + $url = substr($simpifiedURL, strlen($simpifiedBase)); + } else { + throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); + } + } + + return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); + } + + /** + * Simplify a URL. + * Ignores https/http differences and "www." / non differences. + * + * @param string $url + * @return string + */ + protected function simplifyURL($url) + { + return preg_replace('#^https?://(www\.)?#i', 'http://www.', $url); + } + + /** + * Returns true if the given URL is in the list of processed URls + * + * @param string $processedURL The processed URL + * @return boolean True if it exists, false otherwise + */ + public function hasProcessedURL($processedURL) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + return in_array($processedURL, $this->urls['regular']) || in_array($processedURL, $this->urls['inferred']); + } + + /** + * Return the processed URL that is the parent of the given one. + * + * Both input and output are processed URLs + * + * @param string $url A relative URL + * @return string [description] + */ + public function parentProcessedURL($processedURL) + { + if ($processedURL == "/") { + return ""; + } + + // URL heirachy can be broken down by querystring or by URL + $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/')); + + // Special case for children of the root + if ($breakpoint == 0) { + return "/"; + } + + // Get parent URL + $parentProcessedURL = substr($processedURL, 0, $breakpoint); + + // If an intermediary URL doesn't exist, create it + if (!$this->hasProcessedURL($parentProcessedURL)) { + $this->addInferredURL($parentProcessedURL); + } + + return $parentProcessedURL; + } + + /** + * Return the regular URL, given the processed one. + * + * Note that the URL processing isn't reversible, so this function works looks by iterating through all URLs. + * If the URL doesn't exist in the list, this function returns null. + * + * @param string $processedURL The URL after processing has been applied. + * @return string The original URL. + */ + public function unprocessedURL($processedURL) + { + if ($url = array_search($processedURL, $this->urls['regular'])) { + return $url; + } elseif (in_array($processedURL, $this->urls['inferred'])) { + return $processedURL; + } else { + return null; + } + } + + /** + * Find the processed URL in the URL list + * @param [type] $url [description] + * @return [type] [description] + */ + public function processedURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + if (isset($this->urls['regular'][$url])) { + // Generate it if missing + if ($this->urls['regular'][$url] === true) { + $this->urls['regular'][$url] = $this->generateProcessedURL($url); + } + return $this->urls['regular'][$url]; + } elseif (in_array($url, $this->urls['inferred'])) { + return $url; + } + } + + /** + * Execute custom logic for processing URLs prior to heirachy generation. + * + * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. + * + * @param string $url The unprocessed URL + * @return string The processed URL + */ + public function generateProcessedURL($url) + { + if (!$url) { + throw new LogicException("Can't pass a blank URL to generateProcessedURL"); + } + if ($this->urlProcessor) { + $url = $this->urlProcessor->processURL($url); + } + if (!$url) { + throw new LogicException(get_class($this->urlProcessor) . " returned a blank URL."); + } + return $url; + } + + /** + * Return the URLs that are a child of the given URL + * @param [type] $url [description] + * @return [type] [description] + */ + public function getChildren($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + $processedURL = $this->processedURL($url); + + // Subtly different regex if the URL ends in ? or / + if (preg_match('#[/?]$#', $processedURL)) { + $regEx = '#^'.preg_quote($processedURL, '#') . '[^/?]+$#'; + } else { + $regEx = '#^'.preg_quote($processedURL, '#') . '[/?][^/?]+$#'; + } + + $children = array(); + foreach ($this->urls['regular'] as $potentialChild => $potentialProcessedChild) { + if (preg_match($regEx, $potentialProcessedChild)) { + if (!isset($children[$potentialProcessedChild])) { + $children[$potentialProcessedChild] = $potentialChild; + } + } + } + foreach ($this->urls['inferred'] as $potentialProcessedChild) { + if (preg_match($regEx, $potentialProcessedChild)) { + if (!isset($children[$potentialProcessedChild])) { + $children[$potentialProcessedChild] = $potentialProcessedChild; + } + } + } + + return array_values($children); + } } -class StaticSiteCrawler extends PHPCrawler { - protected $urlList; - - /** - * - * @var bool - */ - protected $verbose = false; - - function __construct(StaticSiteUrlList $urlList, $limit=false, $verbose=false) { - parent::__construct(); - $this->urlList = $urlList; - $this->verbose = $verbose; - if($limit) { - $this->setPageLimit($limit); - } - } - - function handleHeaderInfo(PHPCrawlerResponseHeader $header) { - // Don't parse 400/500 responses - if($header->http_status_code > 399) { - $message = $header->source_url . " - skipped as it's $header->http_status_code".PHP_EOL; - error_log($message, 3, '/tmp/urls'); - if($this->verbose) { - echo "[!] ".$message; - } - return -1; - } - } - - function handleDocumentInfo(PHPCrawlerDocumentInfo $info) { - // Ignore errors and redirects - if($info->http_status_code < 200) return; - if($info->http_status_code > 299) return; - - // Ignore non HTML - if(!preg_match('#/x?html#', $info->content_type)) return; - - $this->urlList->addAbsoluteURL($info->url); - if($this->verbose) { - echo "[+] ".$info->url.PHP_EOL; - } - $this->urlList->saveURLs(); - } - - protected function initCrawlerProcess() { - parent::initCrawlerProcess(); - - // Add additional URLs to crawl to the crawler's LinkCache - // NOTE: This is using an undocumented API - if($extraURLs = $this->urlList->getExtraCrawlURLs()) { - foreach($extraURLs as $extraURL) { - $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL)); - } - } - - // Prevent URLs that matches the exclude patterns to be fetched - if($excludePatterns = $this->urlList->getExcludePatterns()) { - foreach($excludePatterns as $pattern) { - $validRegExp = $this->addURLFilterRule('|'.str_replace('|', '\|', $pattern).'|'); - - if(!$validRegExp) { - throw new InvalidArgumentException('Exclude url pattern "'.$pattern.'" is not a valid regular expression.'); - } - } - } +class StaticSiteCrawler extends PHPCrawler +{ + protected $urlList; + + /** + * + * @var bool + */ + protected $verbose = false; + + public function __construct(StaticSiteUrlList $urlList, $limit=false, $verbose=false) + { + parent::__construct(); + $this->urlList = $urlList; + $this->verbose = $verbose; + if ($limit) { + $this->setPageLimit($limit); + } } -} \ No newline at end of file + + public function handleHeaderInfo(PHPCrawlerResponseHeader $header) + { + // Don't parse 400/500 responses + if ($header->http_status_code > 399) { + $message = $header->source_url . " - skipped as it's $header->http_status_code".PHP_EOL; + error_log($message, 3, '/tmp/urls'); + if ($this->verbose) { + echo "[!] ".$message; + } + return -1; + } + } + + public function handleDocumentInfo(PHPCrawlerDocumentInfo $info) + { + // Ignore errors and redirects + if ($info->http_status_code < 200) { + return; + } + if ($info->http_status_code > 299) { + return; + } + + // Ignore non HTML + if (!preg_match('#/x?html#', $info->content_type)) { + return; + } + + $this->urlList->addAbsoluteURL($info->url); + if ($this->verbose) { + echo "[+] ".$info->url.PHP_EOL; + } + $this->urlList->saveURLs(); + } + + protected function initCrawlerProcess() + { + parent::initCrawlerProcess(); + + // Add additional URLs to crawl to the crawler's LinkCache + // NOTE: This is using an undocumented API + if ($extraURLs = $this->urlList->getExtraCrawlURLs()) { + foreach ($extraURLs as $extraURL) { + $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL)); + } + } + + // Prevent URLs that matches the exclude patterns to be fetched + if ($excludePatterns = $this->urlList->getExcludePatterns()) { + foreach ($excludePatterns as $pattern) { + $validRegExp = $this->addURLFilterRule('|'.str_replace('|', '\|', $pattern).'|'); + + if (!$validRegExp) { + throw new InvalidArgumentException('Exclude url pattern "'.$pattern.'" is not a valid regular expression.'); + } + } + } + } +} diff --git a/code/StaticSiteUrlProcessor.php b/code/StaticSiteUrlProcessor.php index f762bf1..4b443f1 100644 --- a/code/StaticSiteUrlProcessor.php +++ b/code/StaticSiteUrlProcessor.php @@ -13,75 +13,88 @@ * * More sophisticated processing might be done to facilitate importing of less */ -interface StaticSiteUrlProcessor { +interface StaticSiteUrlProcessor +{ - /** - * Return a name for the style of URLs to be processed. - * - * This name will be shown in the CMS when users are configuring the content import. - * - * @return string The name, in plaintext (no HTML) - */ - function getName(); + /** + * Return a name for the style of URLs to be processed. + * + * This name will be shown in the CMS when users are configuring the content import. + * + * @return string The name, in plaintext (no HTML) + */ + public function getName(); - /** - * Return an explanation of what processing is done. - * - * This explanation will be shown in the CMS when users are configuring the content import. - * - * @return string The description, in plaintext (no HTML) - */ - function getDescription(); + /** + * Return an explanation of what processing is done. + * + * This explanation will be shown in the CMS when users are configuring the content import. + * + * @return string The description, in plaintext (no HTML) + */ + public function getDescription(); - /** - * Return a description for this processor, to be shown in the CMS. - * @param string $url The unprocessed URL - * @return string The name - */ - function processURL($url); + /** + * Return a description for this processor, to be shown in the CMS. + * @param string $url The unprocessed URL + * @return string The name + */ + public function processURL($url); } /** * Processor for MOSS URLs */ -class StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor { - function getName() { - return "Simple clean-up (recommended)"; - } +class StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor +{ + public function getName() + { + return "Simple clean-up (recommended)"; + } - function getDescription() { - return "Drop file extensions and trailing slashes on URLs but otherwise leave them the same"; - } + public function getDescription() + { + return "Drop file extensions and trailing slashes on URLs but otherwise leave them the same"; + } - function processURL($url) { - if(preg_match('/^([^?]*)\?(.*)$/', $url, $matches)) { - $url = $matches[1]; - $qs = $matches[2]; - if($url != '/') $url = preg_replace('#/$#','',$url); - $url = preg_replace('#\.[^.]*$#','',$url); - return "$url?$qs"; - } else { - if($url != '/') $url = preg_replace('#/$#','',$url); - $url = preg_replace('#\.[^.]*$#','',$url); - return $url; - } - } + public function processURL($url) + { + if (preg_match('/^([^?]*)\?(.*)$/', $url, $matches)) { + $url = $matches[1]; + $qs = $matches[2]; + if ($url != '/') { + $url = preg_replace('#/$#', '', $url); + } + $url = preg_replace('#\.[^.]*$#', '', $url); + return "$url?$qs"; + } else { + if ($url != '/') { + $url = preg_replace('#/$#', '', $url); + } + $url = preg_replace('#\.[^.]*$#', '', $url); + return $url; + } + } } /** * Processor for MOSS URLs */ -class StaticSiteMOSSURLProcessor extends StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor { - function getName() { - return "MOSS-style URLs"; - } +class StaticSiteMOSSURLProcessor extends StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor +{ + public function getName() + { + return "MOSS-style URLs"; + } - function getDescription() { - return "Remove '/Pages/' from the URL, and drop extensions"; - } + public function getDescription() + { + return "Remove '/Pages/' from the URL, and drop extensions"; + } - function processURL($url) { - $url = str_ireplace('/Pages/','/',$url); - return parent::processURL($url); - } + public function processURL($url) + { + $url = str_ireplace('/Pages/', '/', $url); + return parent::processURL($url); + } } diff --git a/code/tasks/ExternalContentImportContentTask.php b/code/tasks/ExternalContentImportContentTask.php index 3987544..5e06c3d 100644 --- a/code/tasks/ExternalContentImportContentTask.php +++ b/code/tasks/ExternalContentImportContentTask.php @@ -3,35 +3,37 @@ /** * External content - run import as a build task, importing content into a new container */ -class ExternalContentImportContentTask extends BuildTask { +class ExternalContentImportContentTask extends BuildTask +{ - function run($request) { - $id = $request->getVar('ID'); - if((!is_numeric($id) && !preg_match('/^[0-9]+_[0-9]+$/', $id)) || !$id) { - echo "

Specify ?ID=(number) or ?ID=(ID)_(Code)

\n"; - return; - } + public function run($request) + { + $id = $request->getVar('ID'); + if ((!is_numeric($id) && !preg_match('/^[0-9]+_[0-9]+$/', $id)) || !$id) { + echo "

Specify ?ID=(number) or ?ID=(ID)_(Code)

\n"; + return; + } - $includeSelected = false; - $includeChildren = true; - $duplicates = 'Duplicate'; - $selected = $id; + $includeSelected = false; + $includeChildren = true; + $duplicates = 'Duplicate'; + $selected = $id; - $target = new Page; - $target->Title = "Import on " . date('Y-m-d H:i:s'); - $target->write(); - $targetType = 'SiteTree'; + $target = new Page; + $target->Title = "Import on " . date('Y-m-d H:i:s'); + $target->write(); + $targetType = 'SiteTree'; - $from = ExternalContent::getDataObjectFor($selected); - if ($from instanceof ExternalContentSource) { - $selected = false; - } + $from = ExternalContent::getDataObjectFor($selected); + if ($from instanceof ExternalContentSource) { + $selected = false; + } - $importer = null; - $importer = $from->getContentImporter($targetType); + $importer = null; + $importer = $from->getContentImporter($targetType); - if ($importer) { - $importer->import($from, $target, $includeSelected, $includeChildren, $duplicates); - } - } + if ($importer) { + $importer->import($from, $target, $includeSelected, $includeChildren, $duplicates); + } + } } diff --git a/code/tasks/StaticSiteCrawlURLsTask.php b/code/tasks/StaticSiteCrawlURLsTask.php index 1366102..eeceb69 100644 --- a/code/tasks/StaticSiteCrawlURLsTask.php +++ b/code/tasks/StaticSiteCrawlURLsTask.php @@ -4,17 +4,18 @@ * StaticSiteCrawlURLs * */ -class StaticSiteCrawlURLsTask extends BuildTask { - - function run($request) { - $id = $request->getVar('ID'); - if(!is_numeric($id) || !$id) { - echo "

Specify ?ID=(number)

"; - return; - } - // Find all pages - $contentSource = StaticSiteContentSource::get()->byID($id); - $contentSource->urllist()->crawl(false, true); - } +class StaticSiteCrawlURLsTask extends BuildTask +{ + public function run($request) + { + $id = $request->getVar('ID'); + if (!is_numeric($id) || !$id) { + echo "

Specify ?ID=(number)

"; + return; + } + // Find all pages + $contentSource = StaticSiteContentSource::get()->byID($id); + $contentSource->urllist()->crawl(false, true); + } } diff --git a/code/tasks/StaticSiteRewriteLinksTask.php b/code/tasks/StaticSiteRewriteLinksTask.php index c91c902..1089fce 100644 --- a/code/tasks/StaticSiteRewriteLinksTask.php +++ b/code/tasks/StaticSiteRewriteLinksTask.php @@ -3,70 +3,71 @@ /** * Rewrite all links in content imported via staticsiteimporter */ -class StaticSiteRewriteLinksTask extends BuildTask { - - function run($request) { - $id = $request->getVar('ID'); - if(!is_numeric($id) || !$id) { - echo "

Specify ?ID=(number)

"; - return; - } +class StaticSiteRewriteLinksTask extends BuildTask +{ + + public function run($request) + { + $id = $request->getVar('ID'); + if (!is_numeric($id) || !$id) { + echo "

Specify ?ID=(number)

"; + return; + } - // Find all pages - $contentSource = StaticSiteContentSource::get()->byID($id); - $pages = $contentSource->Pages(); + // Find all pages + $contentSource = StaticSiteContentSource::get()->byID($id); + $pages = $contentSource->Pages(); - echo "

Looking through " . $pages->Count() . " pages

\n"; + echo "

Looking through " . $pages->Count() . " pages

\n"; - // Set up rewriter - $pageLookup = $pages->map('StaticSiteURL', 'ID'); - $baseURL = $contentSource->BaseUrl; + // Set up rewriter + $pageLookup = $pages->map('StaticSiteURL', 'ID'); + $baseURL = $contentSource->BaseUrl; - $rewriter = new StaticSiteLinkRewriter(function($url) use($pageLookup, $baseURL) { - $fragment = ""; - if(strpos($url,'#') !== false) { - list($url,$fragment) = explode('#', $url, 2); - $fragment = '#'.$fragment; - } + $rewriter = new StaticSiteLinkRewriter(function ($url) use ($pageLookup, $baseURL) { + $fragment = ""; + if (strpos($url, '#') !== false) { + list($url, $fragment) = explode('#', $url, 2); + $fragment = '#'.$fragment; + } - if($pageLookup[$url]) { - return '[sitetree_link,id='.$pageLookup[$url] .']' . $fragment; - - } else { - if(substr($url,0,strlen($baseURL)) == $baseURL) { - echo "

WARNING: $url couldn't be rewritten.

\n"; - } - return $url . $fragment; - } - }); + if ($pageLookup[$url]) { + return '[sitetree_link,id='.$pageLookup[$url] .']' . $fragment; + } else { + if (substr($url, 0, strlen($baseURL)) == $baseURL) { + echo "

WARNING: $url couldn't be rewritten.

\n"; + } + return $url . $fragment; + } + }); - // Perform rewriting - $changedFields = 0; - foreach($pages as $page) { + // Perform rewriting + $changedFields = 0; + foreach ($pages as $page) { + $schema = $contentSource->getSchemaForURL($page->URLSegment); + // Get fields to process + $fields = array(); + foreach ($schema->ImportRules() as $rule) { + if (!$rule->PlainText) { + $fields[] = $rule->FieldName; + } + } + $fields = array_unique($fields); + - $schema = $contentSource->getSchemaForURL($page->URLSegment); - // Get fields to process - $fields = array(); - foreach($schema->ImportRules() as $rule) { - if(!$rule->PlainText) $fields[] = $rule->FieldName; - } - $fields = array_unique($fields); - + foreach ($fields as $field) { + $newContent = $rewriter->rewriteInContent($page->$field); + if ($newContent != $page->$field) { + $newContent = str_replace(array('%5B', '%5D'), array('[', ']'), $newContent); + $changedFields++; - foreach($fields as $field) { - $newContent = $rewriter->rewriteInContent($page->$field); - if($newContent != $page->$field) { - $newContent = str_replace(array('%5B','%5D'),array('[',']'),$newContent); - $changedFields++; + echo "

Changed $field on $page->Title (#$page->ID).

"; + $page->$field = $newContent; + } + } - echo "

Changed $field on $page->Title (#$page->ID).

"; - $page->$field = $newContent; - } - } - - $page->write(); - } - echo "

DONE. Amended $changedFields content fields.

".PHP_EOL; - - } -} \ No newline at end of file + $page->write(); + } + echo "

DONE. Amended $changedFields content fields.

".PHP_EOL; + } +}