diff --git a/code/StaticSiteContentExtractor.php b/code/StaticSiteContentExtractor.php
index 325a481..d6bd55c 100644
--- a/code/StaticSiteContentExtractor.php
+++ b/code/StaticSiteContentExtractor.php
@@ -9,323 +9,344 @@
* Given a set of fieldnames and CSS selectors corresponding to them, a map of content
* fields will be returned.
*/
-class StaticSiteContentExtractor extends Object {
-
- /**
- *
- * @var string
- */
- protected $url = null;
-
- /**
- *
- * @var string
- */
- protected $content = null;
-
- /**
- *
- * @var phpQueryObject
- */
- protected $phpQuery = null;
-
- /**
- * Set this by using the yml config system
- *
- * Example:
- *
- * StaticSiteContentExtractor:
+class StaticSiteContentExtractor extends Object
+{
+
+ /**
+ * Absolute URL of the page to extract from; assigned in the constructor.
+ *
+ * @var string
+ */
+ protected $url = null;
+
+ /**
+ * Response body fetched for $url; remains null until fetchContent() has run.
+ *
+ * @var string
+ */
+ protected $content = null;
+
+ /**
+ * phpQuery document built from $content by fetchContent(); null until then,
+ * which is what the extract*() methods test to decide whether to fetch.
+ *
+ * @var phpQueryObject
+ */
+ protected $phpQuery = null;
+
+ /**
+ * Set this by using the yml config system
+ *
+ * Example:
+ *
+ *
+ * @var string
+ */
+ private static $log_file = null;
+
+ /**
+ * Create a StaticSiteContentExtractor for a single URL.
+ *
+ * Only stores the URL; no request is made here. The page is fetched the
+ * first time one of the extract*() methods finds no parsed document.
+ *
+ * @param string $url The absolute URL to extract content from
+ */
+ public function __construct($url)
+ {
+ $this->url = $url;
+ }
+
+    /**
+     * Extract content for a map of field => css-selector pairs.
+     *
+     * Each field maps to one rule or a list of rules. A rule is either a bare
+     * CSS selector string or an array with a 'selector' key plus the optional
+     * keys 'attribute', 'outerhtml', 'excludeselectors' and 'plaintext'.
+     * Rules are tried in order; the first one that still has content after
+     * exclusions wins and the remaining rules for that field are ignored.
+     * Fields for which no rule yields content are absent from the result.
+     *
+     * @param array $selectorMap A map of field name => css-selector
+     * @return array A map of field name => the winning rule array with an added 'content' key
+     */
+    public function extractMapAndSelectors($selectorMap)
+    {
+        if (!$this->phpQuery) {
+            $this->fetchContent();
+        }
+
+        $output = array();
+
+        foreach ($selectorMap as $fieldName => $extractionRules) {
+            if (!is_array($extractionRules)) {
+                $extractionRules = array($extractionRules);
+            }
+
+            foreach ($extractionRules as $extractionRule) {
+                if (!is_array($extractionRule)) {
+                    $extractionRule = array('selector' => $extractionRule);
+                }
+
+                // A rule without a selector has nothing to match against
+                if (!isset($extractionRule['selector'])) {
+                    continue;
+                }
+
+                // Optional keys are missing on bare-selector rules; read them
+                // defensively instead of triggering undefined-index errors.
+                $selector = $extractionRule['selector'];
+                $attribute = isset($extractionRule['attribute']) ? $extractionRule['attribute'] : null;
+                $outerHTML = isset($extractionRule['outerhtml']) ? $extractionRule['outerhtml'] : false;
+                $excludeSelectors = isset($extractionRule['excludeselectors'])
+                    ? $extractionRule['excludeselectors']
+                    : null;
+
+                $content = $this->extractField($selector, $attribute, $outerHTML);
+
+                if (!$content) {
+                    continue;
+                }
+
+                $content = $this->excludeContent($excludeSelectors, $selector, $content);
+
+                if (!$content) {
+                    continue;
+                }
+
+                if (!empty($extractionRule['plaintext'])) {
+                    $content = Convert::html2raw($content);
+                }
+
+                // We found a match, select that one and ignore any other selectors
+                $output[$fieldName] = $extractionRule;
+                $output[$fieldName]['content'] = $content;
+                $this->log("Value set for $fieldName");
+                break;
+            }
+        }
+        return $output;
+    }
+
+    /**
+     * Extract content for a single css selector.
+     *
+     * With neither $attribute nor $outerHTML set, the inner HTML of the match
+     * is returned. With $outerHTML set, the outer HTML of every matched
+     * element is concatenated. Otherwise the non-blank values of $attribute
+     * across the matched elements are returned, one per line. $outerHTML takes
+     * precedence when both are given.
+     *
+     * @param string $cssSelector The selector for which to extract content.
+     * @param string $attribute If set, the value will be from this HTML attribute
+     * @param bool $outerHTML should we return the full HTML of the whole field
+     * @return string The content for that selector
+     */
+    public function extractField($cssSelector, $attribute = null, $outerHTML = false)
+    {
+        if (!$this->phpQuery) {
+            $this->fetchContent();
+        }
+
+        $elements = $this->phpQuery[$cssSelector];
+
+        // Plain case: neither outer HTML nor an attribute was asked for.
+        // (This used to be "||", which made the attribute branch below
+        // unreachable and ignored $outerHTML unless $attribute was also set.)
+        if (!$outerHTML && !$attribute) {
+            return trim($elements->html());
+        }
+
+        $result = '';
+        foreach ($elements as $element) {
+            if ($outerHTML) {
+                // Get the full html for this element
+                $result .= $this->getOuterHTML($element);
+            } elseif (trim($element->getAttribute($attribute))) {
+                // Get the value of the attribute, skipping blank ones
+                $result .= $element->getAttribute($attribute) . PHP_EOL;
+            }
+        }
+
+        return trim($result);
+    }
+
+    /**
+     * Remove from $content the outer HTML of everything matched by the given
+     * exclusion selectors, each evaluated as a descendant of $parentSelector.
+     *
+     * @param array $excludeSelectors Selectors to strip; blank entries are skipped
+     * @param string $parentSelector Selector the content was extracted with
+     * @param string $content
+     * @return string $content with the matched markup removed
+     */
+    protected function excludeContent($excludeSelectors, $parentSelector, $content)
+    {
+        if (!$excludeSelectors) {
+            return $content;
+        }
+
+        foreach ($excludeSelectors as $candidate) {
+            if (trim($candidate)) {
+                $scopedSelector = $parentSelector.' '.$candidate;
+                $matched = $this->phpQuery[$scopedSelector];
+                if ($matched) {
+                    $content = str_replace($matched->htmlOuter(), '', $content);
+                    $this->log(' - Excluded content from "'.$scopedSelector.'"');
+                }
+            }
+        }
+
+        return $content;
+    }
+
+    /**
+     * Serialise an element, including its descendants, to an HTML string by
+     * deep-copying it into a throwaway document and saving that document.
+     *
+     * @param DOMElement $element
+     * @return string
+     */
+    protected function getOuterHTML(DOMElement $element)
+    {
+        $holder = new DOMDocument();
+        $holder->substituteEntities = false;
+        $holder->preserveWhiteSpace = true;
+        $holder->formatOutput = false;
+
+        // importNode(…, true) clones the whole subtree into $holder
+        $clone = $holder->importNode($element, true);
+        $holder->appendChild($clone);
+
+        return $holder->saveHTML();
+    }
+
+ /**
+ * Return the response body stored by fetchContent().
+ *
+ * This getter does not trigger a fetch itself, so the value is still null
+ * if nothing has been extracted yet.
+ *
+ * @return string
+ */
+ public function getContent()
+ {
+ return $this->content;
+ }
+
+ /**
+ * Fetch the content and initialise $this->content and $this->phpQuery
+ *
+ * The URL is requested first and validated afterwards, so a malformed URL
+ * still causes a request before the exception is thrown.
+ *
+ * @return void
+ * @throws Exception if $this->url has no scheme, or no scheme://host prefix
+ */
+ protected function fetchContent()
+ {
+ $this->log('Fetching ' . $this->url);
+
+ $response = $this->curlRequest($this->url, "GET");
+ $this->content = $response->getBody();
+ $this->phpQuery = phpQuery::newDocument($this->content);
+
+ //// Make the URLs all absolute
+ // NOTE(review): the rewriter built below is never applied, because the
+ // rewriteInPQ() call at the end of this method is commented out.
+
+ // Useful parts of the URL
+ // $protocol is the scheme including the trailing colon, e.g. "http:"
+ if (!preg_match('#^[a-z]+:#i', $this->url, $matches)) {
+ throw new Exception('Bad URL: ' . $this->url);
+ }
+ $protocol = $matches[0];
+
+ // $server is scheme://host with no trailing slash
+ if (!preg_match('#^[a-z]+://[^/]+#i', $this->url, $matches)) {
+ throw new Exception('Bad URL: ' . $this->url);
+ }
+ $server = $matches[0];
+
+ // Directory of the current page, always ending in "/"
+ $base = (substr($this->url, -1) == '/') ? $this->url : dirname($this->url) . '/';
+
+ $this->log('Rewriting links in content');
+
+ $rewriter = new StaticSiteLinkRewriter(function ($url) use ($protocol, $server, $base) {
+ // Absolute
+ if (preg_match('#^[a-z]+://[^/]+#i', $url) || substr($url, 0, 7) == 'mailto:') {
+ return $url;
+ }
+
+ // Protocol relative
+ if (preg_match('#^//[^/]#i', $url)) {
+ return $protocol . $url;
+ }
+
+ // Server relative
+ // NOTE(review): $url[0] on an empty string raises an offset error - confirm callers never pass ''
+ if ($url[0] == "/") {
+ return $server . $url;
+ }
+
+ // Relative
+ $result = $base . $url;
+ // NOTE(review): the two dots in this pattern are unescaped, so they match
+ // any two characters rather than a literal ".." - confirm this is intended
+ while (strpos($result, '/../') !== false) {
+ $result = preg_replace('#[^/]+/+../+#i', '/', $result);
+ }
+ while (strpos($result, '/./') !== false) {
+ $result = str_replace('/./', '/', $result);
+ }
+ return $result;
+
+ });
+
+ #$rewriter->rewriteInPQ($this->phpQuery);
+ #echo($this->phpQuery->html());
+ }
+
+    /**
+     * Use cURL to request a URL, and return a SS_HTTPResponse object.
+     *
+     * Redirects are followed and the connection timeout is 5 seconds. Carriage
+     * returns are stripped from the whole raw response before it is split into
+     * headers and body. A cURL error or a status code of 0 is reported as 500.
+     *
+     * @param string $url
+     * @param string $method HTTP verb; 'POST' and 'PUT' additionally send $data
+     * @param string $data Request body for POST/PUT
+     * @param array $headers Header lines handed to CURLOPT_HTTPHEADER
+     * @param array $curlOptions Extra cURL options, applied after the defaults
+     * @return \SS_HTTPResponse
+     */
+    protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = array())
+    {
+        $ch = curl_init();
+        $timeout = 5;
+        $ssInfo = new SapphireInfo;
+        $useragent = 'SilverStripe/' . $ssInfo->version();
+        $put = null;
+
+        curl_setopt($ch, CURLOPT_URL, $url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
+        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
+        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
+        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
+        curl_setopt($ch, CURLOPT_HEADER, 1);
+
+        if ($headers) {
+            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
+        }
+
+        // Add fields to POST and PUT requests
+        if ($method == 'POST') {
+            curl_setopt($ch, CURLOPT_POST, 1);
+            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
+        } elseif ($method == 'PUT') {
+            $put = fopen("php://temp", 'r+');
+            fwrite($put, $data);
+            fseek($put, 0);
+
+            curl_setopt($ch, CURLOPT_PUT, 1);
+            curl_setopt($ch, CURLOPT_INFILE, $put);
+            curl_setopt($ch, CURLOPT_INFILESIZE, strlen($data));
+        }
+
+        // Follow redirects
+        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
+
+        // Set any custom options passed to the request() function
+        curl_setopt_array($ch, $curlOptions);
+
+        // Run request
+        // NOTE(review): certificate verification is switched off, and because
+        // this is set after $curlOptions a caller cannot switch it back on.
+        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
+        $fullResponseBody = curl_exec($ch);
+        $curlError = curl_error($ch);
+        $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+        curl_close($ch);
+
+        // The temporary upload stream is only needed while the request runs
+        if (is_resource($put)) {
+            fclose($put);
+        }
+
+        // Split headers from body. curl_exec() returns false on failure and a
+        // response may lack the blank separator line, so never assume that
+        // explode() produced two parts.
+        $responseHeaders = '';
+        $responseBody = null;
+        if (is_string($fullResponseBody)) {
+            $parts = explode("\n\n", str_replace("\r", "", $fullResponseBody), 2);
+            $responseHeaders = $parts[0];
+            $responseBody = isset($parts[1]) ? $parts[1] : null;
+
+            // Skip an interim "100 Continue" header block
+            if (preg_match("#^HTTP/1.1 100#", $responseHeaders)) {
+                $parts = explode("\n\n", (string) $responseBody, 2);
+                $responseHeaders = $parts[0];
+                $responseBody = isset($parts[1]) ? $parts[1] : null;
+            }
+        }
+
+        // Drop the status line; the remaining lines are "Name: value" headers
+        $responseHeaders = explode("\n", trim($responseHeaders));
+        array_shift($responseHeaders);
+
+        if ($curlError !== '' || $statusCode == 0) {
+            $statusCode = 500;
+        }
+
+        $response = new SS_HTTPResponse($responseBody, $statusCode);
+        foreach ($responseHeaders as $headerLine) {
+            if (strpos($headerLine, ":") !== false) {
+                list($headerName, $headerVal) = explode(":", $headerLine, 2);
+                $response->addHeader(trim($headerName), trim($headerVal));
+            }
+        }
+
+        return $response;
+    }
+
+    /**
+     * Append a message to the file named by the 'log_file' config value of
+     * StaticSiteContentExtractor. Does nothing when that value is unset, or
+     * when the file can neither be written to nor created in its directory.
+     *
+     * @param string $message
+     * @return void
+     */
+    protected function log($message)
+    {
+        $target = Config::inst()->get('StaticSiteContentExtractor', 'log_file');
+
+        if ($target) {
+            $appendable = is_writable($target);
+            // Only consider creating the file when it does not exist yet
+            $creatable = !$appendable && !file_exists($target) && is_writable(dirname($target));
+
+            if ($appendable || $creatable) {
+                error_log($message . "\n", 3, $target);
+            }
+        }
+    }
}
diff --git a/code/StaticSiteContentItem.php b/code/StaticSiteContentItem.php
index 56ffc0b..8bca9fa 100644
--- a/code/StaticSiteContentItem.php
+++ b/code/StaticSiteContentItem.php
@@ -1,69 +1,81 @@
externalId;
-
- $processedURL = $this->source->urlList()->processedURL($url);
- $parentURL = $this->source->urlList()->parentProcessedURL($processedURL);
-
- $subURL = substr($processedURL, strlen($parentURL));
- if($subURL != "/") $subURL = preg_replace('#(^/)|(/$)#','',$subURL);
-
- $this->Name = $subURL;
- $this->Title = $this->Name;
- $this->AbsoluteURL = preg_replace('#/$#','', $this->source->BaseUrl) . $this->externalId;
- $this->ProcessedURL = $processedURL;
- }
-
- public function stageChildren($showAll = false) {
- if(!$this->source->urlList()->hasCrawled()) return new ArrayList;
-
- $childrenURLs = $this->source->urlList()->getChildren($this->externalId);
-
- $children = new ArrayList;
- foreach($childrenURLs as $child) {
- $children->push($this->source->getObject($child));
- }
-
- return $children;
- }
-
- public function numChildren() {
- if(!$this->source->urlList()->hasCrawled()) return 0;
-
- return sizeof($this->source->urlList()->getChildren($this->externalId));
- }
-
- public function getType() {
- return "sitetree";
- }
-
- public function getCMSFields() {
- $fields = parent::getCMSFields();
-
- // Add the preview fields here, including rules used
- $t = new StaticSitePageTransformer;
-
- $urlField = new ReadonlyField("PreviewSourceURL", "Imported from",
- "AbsoluteURL\">" . Convert::raw2xml($this->AbsoluteURL) . "");
- $urlField->dontEscape = true;
-
- $fields->addFieldToTab("Root.Preview", $urlField);
-
- $content = $t->getContentFieldsAndSelectors($this);
- if(count($content) === 0) {
- return $fields;
- }
- foreach($content as $k => $v) {
- $readonlyField = new ReadonlyField("Preview$k", "$k
+ * StaticSiteContentExtractor:
* log_file: ../logs/import-log.txt
- *
- *
- * @var string
- */
- private static $log_file = null;
-
- /**
- * Create a StaticSiteContentExtractor for a single URL/.
- *
- * @param string $url The absolute URL to extract content from
- */
- public function __construct($url) {
- $this->url = $url;
- }
-
- /**
- * Extract content for map of field => css-selector pairs
- *
- * @param array $selectorMap A map of field name => css-selector
- * @return array A map of field name => array('selector' => selector, 'content' => field content)
- */
- public function extractMapAndSelectors($selectorMap) {
-
- if(!$this->phpQuery) {
- $this->fetchContent();
- }
-
- $output = array();
-
- foreach($selectorMap as $fieldName => $extractionRules) {
- if(!is_array($extractionRules)) {
- $extractionRules = array($extractionRules);
- }
-
- foreach($extractionRules as $extractionRule) {
- if(!is_array($extractionRule)) {
- $extractionRule = array('selector' => $extractionRule);
- }
-
- $content = $this->extractField($extractionRule['selector'], $extractionRule['attribute'], $extractionRule['outerhtml']);
-
- if(!$content) {
- continue;
- }
-
- $content = $this->excludeContent($extractionRule['excludeselectors'], $extractionRule['selector'], $content);
-
- if(!$content) {
- continue;
- }
-
- if(!empty($extractionRule['plaintext'])) {
- $content = Convert::html2raw($content);
- }
-
- // We found a match, select that one and ignore any other selectors
- $output[$fieldName] = $extractionRule;
- $output[$fieldName]['content'] = $content;
- $this->log("Value set for $fieldName");
- break;
- }
- }
- return $output;
- }
-
- /**
- * Extract content for a single css selector
- *
- * @param string $cssSelector The selector for which to extract content.
- * @param string $attribute If set, the value will be from this HTML attribute
- * @param bool $outherHTML should we return the full HTML of the whole field
- * @return string The content for that selector
- */
- public function extractField($cssSelector, $attribute = null, $outerHTML = false) {
- if(!$this->phpQuery) {
- $this->fetchContent();
- }
-
- $elements = $this->phpQuery[$cssSelector];
-
- // just return the inner HTML for this node
- if(!$outerHTML || !$attribute) {
- return trim($elements->html());
- }
-
- $result = '';
- foreach($elements as $element) {
- // Get the full html for this element
- if($outerHTML) {
- $result .= $this->getOuterHTML($element);
- // Get the value of a attribute
- } elseif($attribute && trim($element->getAttribute($attribute))) {
- $result .= ($element->getAttribute($attribute)).PHP_EOL;
- }
- }
-
- return trim($result);
- }
-
- /**
- * Strip away content from $content that matches one or many css selectors.
- *
- * @param array $excludeSelectors
- * @param string $content
- * @return string
- */
- protected function excludeContent($excludeSelectors, $parentSelector, $content) {
- if(!$excludeSelectors) {
- return $content;
- }
-
- foreach($excludeSelectors as $excludeSelector) {
- if(!trim($excludeSelector)) {
- continue;
- }
- $element = $this->phpQuery[$parentSelector.' '.$excludeSelector];
- if($element) {
- $remove = $element->htmlOuter();
- $content = str_replace($remove, '', $content);
- $this->log(' - Excluded content from "'.$parentSelector.' '.$excludeSelector.'"');
- }
- }
- return ($content);
- }
-
- /**
- * Get the full HTML of the element and its childs
- *
- * @param DOMElement $element
- * @return string
- */
- protected function getOuterHTML(DOMElement $element) {
- $doc = new DOMDocument();
- $doc->formatOutput = false;
- $doc->preserveWhiteSpace = true;
- $doc->substituteEntities = false;
- $doc->appendChild($doc->importNode($element, true));
- return $doc->saveHTML();
- }
-
- /**
- *
- * @return string
- */
- public function getContent() {
- return $this->content;
- }
-
- /**
- * Fetch the content and initialise $this->content and $this->phpQuery
- *
- * @return void
- */
- protected function fetchContent() {
- $this->log('Fetching ' . $this->url);
-
- $response = $this->curlRequest($this->url, "GET");
- $this->content = $response->getBody();
- $this->phpQuery = phpQuery::newDocument($this->content);
-
- //// Make the URLs all absolute
-
- // Useful parts of the URL
- if(!preg_match('#^[a-z]+:#i', $this->url, $matches)) throw new Exception('Bad URL: ' . $this->url);
- $protocol = $matches[0];
-
- if(!preg_match('#^[a-z]+://[^/]+#i', $this->url, $matches)) throw new Exception('Bad URL: ' . $this->url);
- $server = $matches[0];
-
- $base = (substr($this->url,-1) == '/') ? $this->url : dirname($this->url) . '/';
-
- $this->log('Rewriting links in content');
-
- $rewriter = new StaticSiteLinkRewriter(function($url) use($protocol, $server, $base) {
- // Absolute
- if(preg_match('#^[a-z]+://[^/]+#i', $url) || substr($url,0,7) == 'mailto:') return $url;
-
- // Protocol relative
- if(preg_match('#^//[^/]#i', $url)) return $protocol . $url;
-
- // Server relative
- if($url[0] == "/") return $server . $url;
-
- // Relative
- $result = $base . $url;
- while(strpos($result, '/../') !== false) {
- $result = preg_replace('#[^/]+/+../+#i','/', $result);
- }
- while(strpos($result, '/./') !== false) {
- $result = str_replace('/./','/', $result);
- }
- return $result;
-
- });
-
- #$rewriter->rewriteInPQ($this->phpQuery);
- #echo($this->phpQuery->html());
- }
-
- /**
- * Use cURL to request a URL, and return a SS_HTTPResponse object.
- *
- * @param string $url
- * @param string $method
- * @param string $data
- * @param string $headers
- * @param array $curlOptions
- * @return \SS_HTTPResponse
- */
- protected function curlRequest($url, $method, $data = null, $headers = null, $curlOptions = array()) {
- $ch = curl_init();
- $timeout = 5;
- $ssInfo = new SapphireInfo;
- $useragent = 'SilverStripe/' . $ssInfo->version();
-
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
- curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
- curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
- curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
- curl_setopt($ch, CURLOPT_HEADER, 1);
-
- if($headers) curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
-
- // Add fields to POST and PUT requests
- if($method == 'POST') {
- curl_setopt($ch, CURLOPT_POST, 1);
- curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
- } elseif($method == 'PUT') {
- $put = fopen("php://temp", 'r+');
- fwrite($put, $data);
- fseek($put, 0);
-
- curl_setopt($ch, CURLOPT_PUT, 1);
- curl_setopt($ch, CURLOPT_INFILE, $put);
- curl_setopt($ch, CURLOPT_INFILESIZE, strlen($data));
- }
-
- // Follow redirects
- curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
-
- // Set any custom options passed to the request() function
- curl_setopt_array($ch, $curlOptions);
-
- // Run request
- curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
- $fullResponseBody = curl_exec($ch);
- $curlError = curl_error($ch);
-
- list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r","",$fullResponseBody), 2);
- if(preg_match("#^HTTP/1.1 100#", $responseHeaders)) {
- list($responseHeaders, $responseBody) = explode("\n\n", str_replace("\r","",$responseBody), 2);
- }
-
- $responseHeaders = explode("\n", trim($responseHeaders));
- array_shift($responseHeaders);
-
- $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
- curl_close($ch);
-
- if($curlError !== '' || $statusCode == 0) {
- $statusCode = 500;
- }
-
- $response = new SS_HTTPResponse($responseBody, $statusCode);
- foreach($responseHeaders as $headerLine) {
- if(strpos($headerLine, ":") !== false) {
- list($headerName, $headerVal) = explode(":", $headerLine, 2);
- $response->addHeader(trim($headerName), trim($headerVal));
- }
- }
-
-
- return $response;
- }
-
- /**
- * Log a message if the logging has been setup according to docs
- *
- * @param string $message
- * @return void
- */
- protected function log($message) {
- $logFile = Config::inst()->get('StaticSiteContentExtractor','log_file');
- if(!$logFile) {
- return;
- }
-
- if(is_writable($logFile) || !file_exists($logFile) && is_writable(dirname($logFile))) {
- error_log($message . "\n", 3, $logFile);
- }
- }
+ *
\n" . $v['selector'] . "", $v['content']);
- $readonlyField->addExtraClass('readonly-click-toggle');
- $fields->addFieldToTab("Root.Preview", $readonlyField);
- }
-
- Requirements::javascript('staticsiteconnector/js/StaticSiteContentItem.js');
- Requirements::css('staticsiteconnector/css/StaticSiteContentItem.css');
-
- return $fields;
- }
-}
\ No newline at end of file
+class StaticSiteContentItem extends ExternalContentItem
+{
+    /**
+     * Populate Name, Title, AbsoluteURL and ProcessedURL from the external ID.
+     *
+     * Name is the processed URL with as many leading characters removed as
+     * the parent's processed URL is long, minus a leading and a trailing
+     * slash; a lone "/" is kept as is.
+     */
+    public function init()
+    {
+        $externalUrl = $this->externalId;
+
+        $processed = $this->source->urlList()->processedURL($externalUrl);
+        $parent = $this->source->urlList()->parentProcessedURL($processed);
+
+        $relative = substr($processed, strlen($parent));
+        $this->Name = ($relative != "/")
+            ? preg_replace('#(^/)|(/$)#', '', $relative)
+            : $relative;
+        $this->Title = $this->Name;
+
+        // Base URL without its trailing slash, followed by the external ID
+        $baseWithoutSlash = preg_replace('#/$#', '', $this->source->BaseUrl);
+        $this->AbsoluteURL = $baseWithoutSlash . $this->externalId;
+        $this->ProcessedURL = $processed;
+    }
+
+    /**
+     * Build the list of child items recorded for this item's external ID.
+     *
+     * @param bool $showAll Not used by this implementation
+     * @return ArrayList Empty when the source has not been crawled yet
+     */
+    public function stageChildren($showAll = false)
+    {
+        $result = new ArrayList;
+
+        if ($this->source->urlList()->hasCrawled()) {
+            foreach ($this->source->urlList()->getChildren($this->externalId) as $childUrl) {
+                $result->push($this->source->getObject($childUrl));
+            }
+        }
+
+        return $result;
+    }
+
+    /**
+     * Count the children recorded for this item's external ID.
+     *
+     * @return int 0 when the source has not been crawled yet
+     */
+    public function numChildren()
+    {
+        return $this->source->urlList()->hasCrawled()
+            ? count($this->source->urlList()->getChildren($this->externalId))
+            : 0;
+    }
+
+ /**
+ * Type identifier for this item; always the literal string "sitetree".
+ *
+ * @return string
+ */
+ public function getType()
+ {
+ return "sitetree";
+ }
+
+ /**
+ * Extend the parent's CMS fields with a "Preview" tab: one read-only field
+ * for the source URL, then one read-only field per entry returned by the
+ * page transformer, labelled with the field name and the matching selector.
+ *
+ * NOTE(review): the string literals building the link and the field titles
+ * look truncated (empty trailing strings, a stray quote) - presumably HTML
+ * markup was lost; verify against the repository before relying on them.
+ *
+ * @return mixed Whatever parent::getCMSFields() returns, with the preview fields added
+ */
+ public function getCMSFields()
+ {
+ $fields = parent::getCMSFields();
+
+ // Add the preview fields here, including rules used
+ $t = new StaticSitePageTransformer;
+
+ $urlField = new ReadonlyField("PreviewSourceURL", "Imported from",
+ "AbsoluteURL\">" . Convert::raw2xml($this->AbsoluteURL) . "");
+ // The value is meant to be output unescaped; the URL itself is escaped above
+ $urlField->dontEscape = true;
+
+ $fields->addFieldToTab("Root.Preview", $urlField);
+
+ $content = $t->getContentFieldsAndSelectors($this);
+ // Nothing extracted: return before the preview JS/CSS requirements are added
+ if (count($content) === 0) {
+ return $fields;
+ }
+ foreach ($content as $k => $v) {
+ $readonlyField = new ReadonlyField("Preview$k", "$k
\n" . $v['selector'] . "", $v['content']);
+ $readonlyField->addExtraClass('readonly-click-toggle');
+ $fields->addFieldToTab("Root.Preview", $readonlyField);
+ }
+
+ Requirements::javascript('staticsiteconnector/js/StaticSiteContentItem.js');
+ Requirements::css('staticsiteconnector/css/StaticSiteContentItem.css');
+
+ return $fields;
+ }
+}
diff --git a/code/StaticSiteContentSource.php b/code/StaticSiteContentSource.php
index 412136d..8caf79f 100644
--- a/code/StaticSiteContentSource.php
+++ b/code/StaticSiteContentSource.php
@@ -1,377 +1,401 @@
'Varchar(255)',
- 'UrlProcessor' => 'Varchar(255)',
- 'ExtraCrawlUrls' => 'Text',
- 'UrlExcludePatterns' => 'Text',
- );
-
- public static $has_many = array(
- "Schemas" => "StaticSiteContentSource_ImportSchema",
- "Pages" => "SiteTree",
- );
-
-
- public function getCMSFields() {
- $fields = parent::getCMSFields();
-
- $importRules = $fields->dataFieldByName('Schemas');
- $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter');
- $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton');
- $addNewButton = new GridFieldAddNewButton('after');
- $addNewButton->setButtonName("Add schema");
- $importRules->getConfig()->addComponent($addNewButton);
-
- $fields->removeFieldFromTab("Root", "Schemas");
- $fields->removeFieldFromTab("Root", "Pages");
- $fields->addFieldToTab("Root.Main", new LiteralField("", "
Each import rule will import content for a field" - . " by getting the results of a CSS selector. If more than one rule exists for a field, then they will be" - . " processed in the order they appear. The first rule that returns content will be the one used.
")); - $fields->addFieldToTab("Root.Main", $importRules); - - $processingOptions = array("" => "No pre-processing"); - foreach(ClassInfo::implementorsOf('StaticSiteUrlProcessor') as $processor) { - $processorObj = new $processor; - $processingOptions[$processor] = "" . Convert::raw2xml($processorObj->getName()) - . "Before importing this content, all URLs on the site must be crawled (like a search engine does). Click" - . " the button below to do so:
" - . "The following URLs have been identified:
" . $urlsAsUL) - ); - - - } - - $fields->dataFieldByName("ExtraCrawlUrls") - ->setDescription("Add URLs that are not reachable through content scraping, eg: '/about/team'. One per line") - ->setTitle('Additional URLs'); - $fields->dataFieldByName("UrlExcludePatterns") - ->setDescription("URLs that should be excluded (support regular expression). eg: '/about/.*'. One per URL") - ->setTitle('Excluded URLs'); - - return $fields; - } - - public function onAfterWrite() { - parent::onAfterWrite(); - - $urlList = $this->urlList(); - if($this->isChanged('UrlProcessor') && $urlList->hasCrawled()) { - if($processorClass = $this->UrlProcessor) { - $urlList->setUrlProcessor(new $processorClass); - } else { - $urlList->setUrlProcessor(null); - } - $urlList->reprocessUrls(); - } - } - - - public function urlList() { - if(!$this->urlList) { - $this->urlList = new StaticSiteUrlList($this->BaseUrl, "../assets/static-site-" . $this->ID); - if($processorClass = $this->UrlProcessor) { - $this->urlList->setUrlProcessor(new $processorClass); - } - if($this->ExtraCrawlUrls) { - $extraCrawlUrls = preg_split('/\s+/', trim($this->ExtraCrawlUrls)); - $this->urlList->setExtraCrawlUrls($extraCrawlUrls); - } - if($this->UrlExcludePatterns) { - $urlExcludePatterns = preg_split('/\s+/', trim($this->UrlExcludePatterns)); - $this->urlList->setExcludePatterns($urlExcludePatterns); - } - } - return $this->urlList; - } - - /** - * Crawl the target site - * @return StaticSiteCrawler - */ - public function crawl($limit=false, $verbose=false) { - if(!$this->BaseUrl) throw new LogicException("Can't crawl a site until Base URL is set."); - return $this->urlList()->crawl($limit, $verbose); - } - - public function getSchemaForURL($absoluteURL) { - // TODO: Return the right schema - return $this->Schemas()->First(); - } - - /** - * Returns a StaticSiteContentItem for the given URL. 
- * Relative URLs are used as the unique identifiers by this importer - * - * @param $id The URL, relative to BaseURL, starting with "/". - * @return DataObject - */ - public function getObject($id) { - - if($id[0] != "/") { - $id = $this->decodeId($id); - if($id[0] != "/") throw new InvalidArgumentException("\$id must start with /"); - } - - return new StaticSiteContentItem($this, $id); - } - - public function getRoot() { - return $this->getObject('/'); - } - - public function allowedImportTargets() { - return array('sitetree' => true); - } - - /** - * Return the root node - * @return ArrayList A list containing the root node - */ - public function stageChildren($showAll = false) { - if(!$this->urlList()->hasCrawled()) return new ArrayList; - - return new ArrayList(array( - $this->getObject("/") - )); - - } - - public function getContentImporter($target=null) { - return new StaticSiteImporter(); - } - - public function isValid() { - if(!(boolean)$this->BaseUrl) { - return false; - } - return true; - } - public function canImport($member = null) { - return $this->isValid(); - } - public function canCreate($member = null) { - return true; - } - +class StaticSiteContentSource extends ExternalContentSource +{ + + public static $db = array( + 'BaseUrl' => 'Varchar(255)', + 'UrlProcessor' => 'Varchar(255)', + 'ExtraCrawlUrls' => 'Text', + 'UrlExcludePatterns' => 'Text', + ); + + public static $has_many = array( + "Schemas" => "StaticSiteContentSource_ImportSchema", + "Pages" => "SiteTree", + ); + + + public function getCMSFields() + { + $fields = parent::getCMSFields(); + + $importRules = $fields->dataFieldByName('Schemas'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); + $addNewButton = new GridFieldAddNewButton('after'); + $addNewButton->setButtonName("Add schema"); + $importRules->getConfig()->addComponent($addNewButton); + + 
$fields->removeFieldFromTab("Root", "Schemas"); + $fields->removeFieldFromTab("Root", "Pages"); + $fields->addFieldToTab("Root.Main", new LiteralField("", "Each import rule will import content for a field" + . " by getting the results of a CSS selector. If more than one rule exists for a field, then they will be" + . " processed in the order they appear. The first rule that returns content will be the one used.
")); + $fields->addFieldToTab("Root.Main", $importRules); + + $processingOptions = array("" => "No pre-processing"); + foreach (ClassInfo::implementorsOf('StaticSiteUrlProcessor') as $processor) { + $processorObj = new $processor; + $processingOptions[$processor] = "" . Convert::raw2xml($processorObj->getName()) + . "Before importing this content, all URLs on the site must be crawled (like a search engine does). Click" + . " the button below to do so:
" + . "The following URLs have been identified:
" . $urlsAsUL) + ); + } + + $fields->dataFieldByName("ExtraCrawlUrls") + ->setDescription("Add URLs that are not reachable through content scraping, eg: '/about/team'. One per line") + ->setTitle('Additional URLs'); + $fields->dataFieldByName("UrlExcludePatterns") + ->setDescription("URLs that should be excluded (support regular expression). eg: '/about/.*'. One per URL") + ->setTitle('Excluded URLs'); + + return $fields; + } + + public function onAfterWrite() + { + parent::onAfterWrite(); + + $urlList = $this->urlList(); + if ($this->isChanged('UrlProcessor') && $urlList->hasCrawled()) { + if ($processorClass = $this->UrlProcessor) { + $urlList->setUrlProcessor(new $processorClass); + } else { + $urlList->setUrlProcessor(null); + } + $urlList->reprocessUrls(); + } + } + + + public function urlList() + { + if (!$this->urlList) { + $this->urlList = new StaticSiteUrlList($this->BaseUrl, "../assets/static-site-" . $this->ID); + if ($processorClass = $this->UrlProcessor) { + $this->urlList->setUrlProcessor(new $processorClass); + } + if ($this->ExtraCrawlUrls) { + $extraCrawlUrls = preg_split('/\s+/', trim($this->ExtraCrawlUrls)); + $this->urlList->setExtraCrawlUrls($extraCrawlUrls); + } + if ($this->UrlExcludePatterns) { + $urlExcludePatterns = preg_split('/\s+/', trim($this->UrlExcludePatterns)); + $this->urlList->setExcludePatterns($urlExcludePatterns); + } + } + return $this->urlList; + } + + /** + * Crawl the target site + * @return StaticSiteCrawler + */ + public function crawl($limit=false, $verbose=false) + { + if (!$this->BaseUrl) { + throw new LogicException("Can't crawl a site until Base URL is set."); + } + return $this->urlList()->crawl($limit, $verbose); + } + + public function getSchemaForURL($absoluteURL) + { + // TODO: Return the right schema + return $this->Schemas()->First(); + } + + /** + * Returns a StaticSiteContentItem for the given URL. 
+ * Relative URLs are used as the unique identifiers by this importer + * + * @param $id The URL, relative to BaseURL, starting with "/". + * @return DataObject + */ + public function getObject($id) + { + if ($id[0] != "/") { + $id = $this->decodeId($id); + if ($id[0] != "/") { + throw new InvalidArgumentException("\$id must start with /"); + } + } + + return new StaticSiteContentItem($this, $id); + } + + public function getRoot() + { + return $this->getObject('/'); + } + + public function allowedImportTargets() + { + return array('sitetree' => true); + } + + /** + * Return the root node + * @return ArrayList A list containing the root node + */ + public function stageChildren($showAll = false) + { + if (!$this->urlList()->hasCrawled()) { + return new ArrayList; + } + + return new ArrayList(array( + $this->getObject("/") + )); + } + + public function getContentImporter($target=null) + { + return new StaticSiteImporter(); + } + + public function isValid() + { + if (!(boolean)$this->BaseUrl) { + return false; + } + return true; + } + public function canImport($member = null) + { + return $this->isValid(); + } + public function canCreate($member = null) + { + return true; + } } /** * A collection of ImportRules that apply to some or all of the pages being imported. */ -class StaticSiteContentSource_ImportSchema extends DataObject { - public static $db = array( - "DataType" => "Varchar", // classname - "Order" => "Int", - "AppliesTo" => "Varchar(255)", // regex - ); - public static $summary_fields = array( - "AppliesTo", - "DataType", - "Order", - ); - public static $field_labels = array( - "AppliesTo" => "URLs applied to", - "DataType" => "Data type", - "Order" => "Priority", - ); - - public static $default_sort = "Order"; - - public static $has_one = array( - "ContentSource" => "StaticSiteContentSource", - ); - - public static $has_many = array( - "ImportRules" => "StaticSiteContentSource_ImportRule", - ); - - public function getTitle() { - return $this->DataType.' 
('.$this->AppliesTo.')'; - } - - /** - * - * @return FieldList - */ - public function getCMSFields() { - $fields = parent::getCMSFields(); - $fields->removeFieldFromTab('Root.Main', 'DataType'); - $fields->removeByName('ContentSourceID'); - $dataObjects = ClassInfo::subclassesFor('DataObject'); - array_shift($dataObjects); - natcasesort($dataObjects); - $fields->addFieldToTab('Root.Main', new DropdownField('DataType', 'DataType', $dataObjects)); - - $importRules = $fields->dataFieldByName('ImportRules'); - if($importRules) { - $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); - $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); - $addNewButton = new GridFieldAddNewButton('after'); - $addNewButton->setButtonName("Add Rule"); - $importRules->getConfig()->addComponent($addNewButton); - - $fields->removeFieldFromTab('Root', 'ImportRules'); - $fields->addFieldToTab('Root.Main', $importRules); - } - - return $fields; - } - - public function requireDefaultRecords() { - foreach(StaticSiteContentSource::get() as $source) { - if(!$source->Schemas()->count()) { - Debug::message("Making a schema for $source->ID"); - $defaultSchema = new StaticSiteContentSource_ImportSchema; - $defaultSchema->Order = 1000000; - $defaultSchema->AppliesTo = ".*"; - $defaultSchema->DataType = "Page"; - $defaultSchema->ContentSourceID = $source->ID; - $defaultSchema->write(); - - - foreach(StaticSiteContentSource_ImportRule::get()->filter(array('SchemaID' => 0)) as $rule) { - $rule->SchemaID = $defaultSchema->ID; - $rule->write(); - } - } - } - } - - /** - * Return the import rules in a format suitable for configuring StaticSiteContentExtractor. - * - * @return array A map of field name => array(CSS selector, CSS selector, ...) 
- */ - public function getImportRules() { - $output = array(); - - foreach($this->ImportRules() as $rule) { - if(!isset($output[$rule->FieldName])) $output[$rule->FieldName] = array(); - $ruleArray = array( - 'selector' => $rule->CSSSelector, - 'attribute' => $rule->Attribute, - 'plaintext' => $rule->PlainText, - 'excludeselectors' => preg_split('/\s+/', trim($rule->ExcludeCSSSelector)), - 'outerhtml' => $rule->OuterHTML, - ); - $output[$rule->FieldName][] = $ruleArray; - } - - return $output; - } - +class StaticSiteContentSource_ImportSchema extends DataObject +{ + public static $db = array( + "DataType" => "Varchar", // classname + "Order" => "Int", + "AppliesTo" => "Varchar(255)", // regex + ); + public static $summary_fields = array( + "AppliesTo", + "DataType", + "Order", + ); + public static $field_labels = array( + "AppliesTo" => "URLs applied to", + "DataType" => "Data type", + "Order" => "Priority", + ); + + public static $default_sort = "Order"; + + public static $has_one = array( + "ContentSource" => "StaticSiteContentSource", + ); + + public static $has_many = array( + "ImportRules" => "StaticSiteContentSource_ImportRule", + ); + + public function getTitle() + { + return $this->DataType.' 
('.$this->AppliesTo.')'; + } + + /** + * + * @return FieldList + */ + public function getCMSFields() + { + $fields = parent::getCMSFields(); + $fields->removeFieldFromTab('Root.Main', 'DataType'); + $fields->removeByName('ContentSourceID'); + $dataObjects = ClassInfo::subclassesFor('DataObject'); + array_shift($dataObjects); + natcasesort($dataObjects); + $fields->addFieldToTab('Root.Main', new DropdownField('DataType', 'DataType', $dataObjects)); + + $importRules = $fields->dataFieldByName('ImportRules'); + if ($importRules) { + $importRules->getConfig()->removeComponentsByType('GridFieldAddExistingAutocompleter'); + $importRules->getConfig()->removeComponentsByType('GridFieldAddNewButton'); + $addNewButton = new GridFieldAddNewButton('after'); + $addNewButton->setButtonName("Add Rule"); + $importRules->getConfig()->addComponent($addNewButton); + + $fields->removeFieldFromTab('Root', 'ImportRules'); + $fields->addFieldToTab('Root.Main', $importRules); + } + + return $fields; + } + + public function requireDefaultRecords() + { + foreach (StaticSiteContentSource::get() as $source) { + if (!$source->Schemas()->count()) { + Debug::message("Making a schema for $source->ID"); + $defaultSchema = new StaticSiteContentSource_ImportSchema; + $defaultSchema->Order = 1000000; + $defaultSchema->AppliesTo = ".*"; + $defaultSchema->DataType = "Page"; + $defaultSchema->ContentSourceID = $source->ID; + $defaultSchema->write(); + + + foreach (StaticSiteContentSource_ImportRule::get()->filter(array('SchemaID' => 0)) as $rule) { + $rule->SchemaID = $defaultSchema->ID; + $rule->write(); + } + } + } + } + + /** + * Return the import rules in a format suitable for configuring StaticSiteContentExtractor. + * + * @return array A map of field name => array(CSS selector, CSS selector, ...) 
+ */ + public function getImportRules() + { + $output = array(); + + foreach ($this->ImportRules() as $rule) { + if (!isset($output[$rule->FieldName])) { + $output[$rule->FieldName] = array(); + } + $ruleArray = array( + 'selector' => $rule->CSSSelector, + 'attribute' => $rule->Attribute, + 'plaintext' => $rule->PlainText, + 'excludeselectors' => preg_split('/\s+/', trim($rule->ExcludeCSSSelector)), + 'outerhtml' => $rule->OuterHTML, + ); + $output[$rule->FieldName][] = $ruleArray; + } + + return $output; + } } /** * A single import rule that forms part of an ImportSchema */ -class StaticSiteContentSource_ImportRule extends DataObject { - public static $db = array( - "FieldName" => "Varchar", - "CSSSelector" => "Text", - "ExcludeCSSSelector" => "Text", - "Attribute" => "Varchar", - "PlainText" => "Boolean", - "OuterHTML" => "Boolean", - ); - - public static $summary_fields = array( - "FieldName", - "CSSSelector", - "Attribute", - "PlainText", - "OuterHTML", - ); - - public static $field_labels = array( - "FieldName" => "Field Name", - "CSSSelector" => "CSS Selector", - "Attribute" => "Element attribute", - "PlainText" => "Convert to plain text", - "OuterHTML" => "Use the outer HTML", - ); - - public static $has_one = array( - "Schema" => "StaticSiteContentSource_ImportSchema", - ); - - public function getTitle() { - return ($this->FieldName)?$this->FieldName:$this->ID; - } - - /** - * - * @return FieldList - */ - public function getCMSFields() { - $fields = parent::getCMSFields(); - - $dataType = $this->Schema()->DataType; - if($dataType) { - $fieldList = singleton($dataType)->inheritedDatabaseFields(); - $fieldList = array_combine(array_keys($fieldList),array_keys($fieldList)); - unset($fieldList->ParentID); - unset($fieldList->WorkflowDefinitionID); - unset($fieldList->Version); - - $fieldNameField = new DropdownField("FieldName", "Field Name", $fieldList); - $fieldNameField->setEmptyString("(choose)"); - $fields->insertBefore($fieldNameField, "CSSSelector"); - } 
else { - $fields->replaceField('FieldName', $fieldName = new ReadonlyField("FieldName", "Field Name")); - $fieldName->setDescription('Save this rule before being able to add a field name'); - } - - return $fields; - } -} \ No newline at end of file +class StaticSiteContentSource_ImportRule extends DataObject +{ + public static $db = array( + "FieldName" => "Varchar", + "CSSSelector" => "Text", + "ExcludeCSSSelector" => "Text", + "Attribute" => "Varchar", + "PlainText" => "Boolean", + "OuterHTML" => "Boolean", + ); + + public static $summary_fields = array( + "FieldName", + "CSSSelector", + "Attribute", + "PlainText", + "OuterHTML", + ); + + public static $field_labels = array( + "FieldName" => "Field Name", + "CSSSelector" => "CSS Selector", + "Attribute" => "Element attribute", + "PlainText" => "Convert to plain text", + "OuterHTML" => "Use the outer HTML", + ); + + public static $has_one = array( + "Schema" => "StaticSiteContentSource_ImportSchema", + ); + + public function getTitle() + { + return ($this->FieldName)?$this->FieldName:$this->ID; + } + + /** + * + * @return FieldList + */ + public function getCMSFields() + { + $fields = parent::getCMSFields(); + + $dataType = $this->Schema()->DataType; + if ($dataType) { + $fieldList = singleton($dataType)->inheritedDatabaseFields(); + $fieldList = array_combine(array_keys($fieldList), array_keys($fieldList)); + unset($fieldList->ParentID); + unset($fieldList->WorkflowDefinitionID); + unset($fieldList->Version); + + $fieldNameField = new DropdownField("FieldName", "Field Name", $fieldList); + $fieldNameField->setEmptyString("(choose)"); + $fields->insertBefore($fieldNameField, "CSSSelector"); + } else { + $fields->replaceField('FieldName', $fieldName = new ReadonlyField("FieldName", "Field Name")); + $fieldName->setDescription('Save this rule before being able to add a field name'); + } + + return $fields; + } +} diff --git a/code/StaticSiteDataExtension.php b/code/StaticSiteDataExtension.php index effd33e..8f78b59 
100644 --- a/code/StaticSiteDataExtension.php +++ b/code/StaticSiteDataExtension.php @@ -1,16 +1,18 @@ "StaticSiteContentSource", - ); - static $db = array( - "StaticSiteURL" => "Varchar(255)", - ); +class StaticSiteDataExtension extends DataExtension +{ + public static $has_one = array( + "StaticSiteContentSource" => "StaticSiteContentSource", + ); + public static $db = array( + "StaticSiteURL" => "Varchar(255)", + ); - function updateCMSFields(FieldList $fields) { - if($this->owner->StaticSiteContentSourceID && $this->owner->StaticSiteURL) { - $fields->addFieldToTab('Root.Main', new ReadonlyField('StaticSiteURL', 'Imported URL'), 'MenuTitle'); - } - } -} \ No newline at end of file + public function updateCMSFields(FieldList $fields) + { + if ($this->owner->StaticSiteContentSourceID && $this->owner->StaticSiteURL) { + $fields->addFieldToTab('Root.Main', new ReadonlyField('StaticSiteURL', 'Imported URL'), 'MenuTitle'); + } + } +} diff --git a/code/StaticSiteExternalContentAdminExtension.php b/code/StaticSiteExternalContentAdminExtension.php index b147aa4..891d7c8 100644 --- a/code/StaticSiteExternalContentAdminExtension.php +++ b/code/StaticSiteExternalContentAdminExtension.php @@ -1,35 +1,37 @@ getSource(); + public function crawlsite($request) + { + $selected = isset($request['ID']) ? $request['ID'] : 0; + if (!$selected) { + $messageType = 'bad'; + $message = _t('ExternalContent.NOITEMSELECTED', 'No item selected to crawl.'); + } else { + $source = ExternalContent::getDataObjectFor($selected); + if (!($source instanceof ExternalContentSource)) { + $source = $from->getSource(); + } - $messageType = 'good'; - $message = _t('ExternalContent.CONTENTMIGRATED', 'Crawling successful.'); + $messageType = 'good'; + $message = _t('ExternalContent.CONTENTMIGRATED', 'Crawling successful.'); - try { - $source->crawl(); - } catch(Exception $e) { - $messageType = 'bad'; - $message = "Error crawling: " . 
$e->getMessage(); - } + try { + $source->crawl(); + } catch (Exception $e) { + $messageType = 'bad'; + $message = "Error crawling: " . $e->getMessage(); + } + } - } + Session::set("FormInfo.Form_EditForm.formError.message", $message); + Session::set("FormInfo.Form_EditForm.formError.type", $messageType); - Session::set("FormInfo.Form_EditForm.formError.message", $message); - Session::set("FormInfo.Form_EditForm.formError.type", $messageType); - - return $this->owner->getResponseNegotiator()->respond($this->owner->getRequest()); - } -} \ No newline at end of file + return $this->owner->getResponseNegotiator()->respond($this->owner->getRequest()); + } +} diff --git a/code/StaticSiteImporter.php b/code/StaticSiteImporter.php index 7ba5493..ac9d2c8 100644 --- a/code/StaticSiteImporter.php +++ b/code/StaticSiteImporter.php @@ -1,12 +1,14 @@ contentTransforms['sitetree'] = new StaticSitePageTransformer(); - } +class StaticSiteImporter extends ExternalContentImporter +{ + public function __construct() + { + $this->contentTransforms['sitetree'] = new StaticSitePageTransformer(); + } - public function getExternalType($item) { - return "sitetree"; - } - -} \ No newline at end of file + public function getExternalType($item) + { + return "sitetree"; + } +} diff --git a/code/StaticSiteLinkRewriter.php b/code/StaticSiteLinkRewriter.php index c2acb2e..4e88254 100644 --- a/code/StaticSiteLinkRewriter.php +++ b/code/StaticSiteLinkRewriter.php @@ -5,65 +5,70 @@ /** * Helper class for rewriting links using phpQuery. 
*/ -class StaticSiteLinkRewriter { +class StaticSiteLinkRewriter +{ - protected $tagMap = array( - 'a' => array('href'), - 'img' => array('src'), - ); + protected $tagMap = array( + 'a' => array('href'), + 'img' => array('src'), + ); - protected $callback; + protected $callback; - function __construct($callback) { - $this->callback = $callback; - } + public function __construct($callback) + { + $this->callback = $callback; + } - /** - * Set a map of tags & attributes to search for URls. - * - * Each key is a tagname, and each value is an array of attribute names. - */ - function setTagMap($tagMap) { - $this->tagMap = $tagMap; - } + /** + * Set a map of tags & attributes to search for URls. + * + * Each key is a tagname, and each value is an array of attribute names. + */ + public function setTagMap($tagMap) + { + $this->tagMap = $tagMap; + } - /** - * Return the tagmap - */ - function getTagMap($tagMap) { - $this->tagMap = $tagMap; - } + /** + * Return the tagmap + */ + public function getTagMap($tagMap) + { + $this->tagMap = $tagMap; + } - /** - * Rewrite URLs in a PHPQuery object. The content of the object will be modified. - * - * @param phpQuery $pq The content containing the links to rewrite - */ - function rewriteInPQ($pq) { - $callback = $this->callback; + /** + * Rewrite URLs in a PHPQuery object. The content of the object will be modified. 
+ * + * @param phpQuery $pq The content containing the links to rewrite + */ + public function rewriteInPQ($pq) + { + $callback = $this->callback; - // Make URLs absolute - foreach($this->tagMap as $tag => $attributes) { - foreach($pq[$tag] as $tagObj) { - foreach($attributes as $attribute) { - if($url = pq($tagObj)->attr($attribute)) { - $newURL = $callback($url); - pq($tagObj)->attr($attribute, $newURL); - } - } - } - } - } + // Make URLs absolute + foreach ($this->tagMap as $tag => $attributes) { + foreach ($pq[$tag] as $tagObj) { + foreach ($attributes as $attribute) { + if ($url = pq($tagObj)->attr($attribute)) { + $newURL = $callback($url); + pq($tagObj)->attr($attribute, $newURL); + } + } + } + } + } - /** - * Rewrite URLs in the given content snippet. Returns the updated content. - * - * @param phpQuery $pq The content containing the links to rewrite - */ - function rewriteInContent($content) { - $pq = phpQuery::newDocument($content); - $this->rewriteInPQ($pq); - return $pq->html(); - } - -} \ No newline at end of file + /** + * Rewrite URLs in the given content snippet. Returns the updated content. 
+ * + * @param phpQuery $pq The content containing the links to rewrite + */ + public function rewriteInContent($content) + { + $pq = phpQuery::newDocument($content); + $this->rewriteInPQ($pq); + return $pq->html(); + } +} diff --git a/code/StaticSitePageTransformer.php b/code/StaticSitePageTransformer.php index efc39bc..9494b1c 100644 --- a/code/StaticSitePageTransformer.php +++ b/code/StaticSitePageTransformer.php @@ -1,91 +1,94 @@ ID, $parentObject->Title"); - Debug::message($item->AbsoluteURL); - } - - // Sleep for 100ms to reduce load on the remote server - usleep(100*1000); - - // Extract content from the page - $contentFields = $this->getContentFieldsAndSelectors($item); - - // Default value for Title - if(empty($contentFields['Title'])) { - $contentFields['Title'] = array('content' => $item->Name); - } - - // Default value for URL segment - if(empty($contentFields['URLSegment'])) { - $urlSegment = str_replace('/','', $item->Name); - $urlSegment = preg_replace('/\.[^.]*$/','',$urlSegment); - $urlSegment = str_replace('.','-', $item->Name); - $contentFields['URLSegment'] = array('content' => $urlSegment); - } - - $schema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); - - $pageType = $schema->DataType; - - if(!$pageType) { - throw new Exception('Pagetype for migration schema is empty!'); - } - - // Create a page with the appropriate fields - $page = new $pageType(array()); - $existingPage = SiteTree::get_by_link($item->getExternalId()); - - if($existingPage && $duplicateStrategy === 'Overwrite') { - if(get_class($existingPage) !== $pageType) { - $existingPage->ClassName = $pageType; - $existingPage->write(); - } - if($existingPage) { - $page = $existingPage; - } - } - - $page->StaticSiteContentSourceID = $item->getSource()->ID; - $page->StaticSiteURL = $item->AbsoluteURL; - - $page->ParentID = $parentObject ? 
$parentObject->ID : 0; - - foreach($contentFields as $k => $v) { - $page->$k = $v['content']; - } - - $page->write(); - - if(Director::is_cli()) { - Debug::message("#$page->Title"); - Debug::message("#$page->ID child of #$page->ID"); - } - - return new TransformResult($page, $item->stageChildren()); - } - - /** - * Get content from the remote host - * - * @param StaticSiteeContentItem $item The item to extract - * @return array A map of field name => array('selector' => selector, 'content' => field content) - */ - public function getContentFieldsAndSelectors($item) { - // Get the import rules from the content source - $importSchema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); - if(!$importSchema) { - return null; - throw new LogicException("Couldn't find an import schema for $item->AbsoluteURL"); - } - $importRules = $importSchema->getImportRules(); - - // Extract from the remote page based on those rules - $contentExtractor = new StaticSiteContentExtractor($item->AbsoluteURL); - - return $contentExtractor->extractMapAndSelectors($importRules); - } -} \ No newline at end of file +class StaticSitePageTransformer implements ExternalContentTransformer +{ + + public function transform($item, $parentObject, $duplicateStrategy) + { + if (Director::is_cli()) { + Debug::message("Parent: #$parentObject->ID, $parentObject->Title"); + Debug::message($item->AbsoluteURL); + } + + // Sleep for 100ms to reduce load on the remote server + usleep(100*1000); + + // Extract content from the page + $contentFields = $this->getContentFieldsAndSelectors($item); + + // Default value for Title + if (empty($contentFields['Title'])) { + $contentFields['Title'] = array('content' => $item->Name); + } + + // Default value for URL segment + if (empty($contentFields['URLSegment'])) { + $urlSegment = str_replace('/', '', $item->Name); + $urlSegment = preg_replace('/\.[^.]*$/', '', $urlSegment); + $urlSegment = str_replace('.', '-', $item->Name); + $contentFields['URLSegment'] = 
array('content' => $urlSegment); + } + + $schema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); + + $pageType = $schema->DataType; + + if (!$pageType) { + throw new Exception('Pagetype for migration schema is empty!'); + } + + // Create a page with the appropriate fields + $page = new $pageType(array()); + $existingPage = SiteTree::get_by_link($item->getExternalId()); + + if ($existingPage && $duplicateStrategy === 'Overwrite') { + if (get_class($existingPage) !== $pageType) { + $existingPage->ClassName = $pageType; + $existingPage->write(); + } + if ($existingPage) { + $page = $existingPage; + } + } + + $page->StaticSiteContentSourceID = $item->getSource()->ID; + $page->StaticSiteURL = $item->AbsoluteURL; + + $page->ParentID = $parentObject ? $parentObject->ID : 0; + + foreach ($contentFields as $k => $v) { + $page->$k = $v['content']; + } + + $page->write(); + + if (Director::is_cli()) { + Debug::message("#$page->Title"); + Debug::message("#$page->ID child of #$page->ID"); + } + + return new TransformResult($page, $item->stageChildren()); + } + + /** + * Get content from the remote host + * + * @param StaticSiteeContentItem $item The item to extract + * @return array A map of field name => array('selector' => selector, 'content' => field content) + */ + public function getContentFieldsAndSelectors($item) + { + // Get the import rules from the content source + $importSchema = $item->getSource()->getSchemaForURL($item->AbsoluteURL); + if (!$importSchema) { + return null; + throw new LogicException("Couldn't find an import schema for $item->AbsoluteURL"); + } + $importRules = $importSchema->getImportRules(); + + // Extract from the remote page based on those rules + $contentExtractor = new StaticSiteContentExtractor($item->AbsoluteURL); + + return $contentExtractor->extractMapAndSelectors($importRules); + } +} diff --git a/code/StaticSiteUrlList.php b/code/StaticSiteUrlList.php index 69c0780..6e40fff 100644 --- a/code/StaticSiteUrlList.php +++ 
b/code/StaticSiteUrlList.php @@ -7,532 +7,606 @@ * * Makes use of PHPCrawl to prepare a list of URLs on the site */ -class StaticSiteUrlList { - protected $baseURL, $cacheDir; - - /** - * Two element array: contains keys 'inferred' and 'regular': - * - 'regular' is an array mapping raw URLs to processed URLs - * - 'inferred' is an array of inferred URLs - */ - protected $urls = null; - - protected $autoCrawl = false; - - protected $urlProcessor = null; - - protected $extraCrawlURLs = null; - - /** - * A list of regular expression patterns to exclude from scraping - * - * @var array - */ - protected $excludePatterns = array(); - - /** - * Create a new URL List - * @param string $baseURL The Base URL to find links on - * @param string $cacheDir The local path to cache data into - */ - function __construct($baseURL, $cacheDir) { - // baseURL mus not have a trailing slash - if(substr($baseURL,-1) == "/") $baseURL = substr($baseURL,0,-1); - // cacheDir must have a trailing slash - if(substr($cacheDir,-1) != "/") $cacheDir .= "/"; - - $this->baseURL = $baseURL; - $this->cacheDir = $cacheDir; - } - - /** - * Set a URL processor for this URL List. - * - * URL processors process the URLs before the site heirarchy and inferred meta-data are generated. - * These can be used to tranform URLs from CMSes that don't provide a natural heirarchy into something - * more useful. - * - * See {@link StaticSiteMOSSURLProcessor} for an example. - * - * @param StaticSiteUrlProcessor $urlProcessor [description] - */ - function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor) { - $this->urlProcessor = $urlProcessor; - } - - /** - * Define additional crawl URLs as an array - * Each of these URLs will be crawled in addition the base URL. 
- * This can be helpful if pages are getting missed by the crawl - */ - function setExtraCrawlURls($extraCrawlURLs) { - $this->extraCrawlURLs = $extraCrawlURLs; - } - - /** - * Return the additional crawl URLs as an array - */ - function getExtraCrawlURLs() { - return $this->extraCrawlURLs; - } - - /** - * Set an array of regular expression patterns that should be excluded from - * being added to the url list - * - * @param array $excludePatterns - */ - public function setExcludePatterns(array $excludePatterns) { - $this->excludePatterns = $excludePatterns; - } - - /** - * Get an array of regular expression patterns that should not be added to - * the url list - * - * @return array - */ - public function getExcludePatterns() { - return $this->excludePatterns; - } - - /** - * - * Set whether the crawl should be triggered on demand. - * @param [type] $autoCrawl [description] - */ - public function setAutoCrawl($autoCrawl) { - $this->autoCrawl = $autoCrawl; - } - - /** - * Returns the status of the spidering: "Complete", "Partial", or "Not started" - * @return [type] [description] - */ - public function getSpiderStatus() { - if(file_exists($this->cacheDir . 'urls')) { - if(file_exists($this->cacheDir . 'crawlerid')) return "Partial"; - else return "Complete"; - - } else { - return "Not started"; - } - } - - /** - * Return the number of URLs crawled so far - */ - public function getNumURLs() { - if($this->urls) { - $urls = $this->urls; - // Don't rely on loadUrls() as it chokes on partially completed imports - } else if(file_exists($this->cacheDir . 'urls')) { - $urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); - } else { - return null; - } - - return sizeof(array_unique($urls['regular'])) + sizeof($urls['inferred']); - } - - /** - * Return the raw URLs as an array - * @return array - */ - public function getRawURLs() { - if($urls = $this->getProcessedURLs()) { - return array_keys($urls); - } - } - - /** - * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values - * @return array - */ - public function getProcessedURLs() { - if($this->hasCrawled() || $this->autoCrawl) { - if($this->urls === null) $this->loadUrls(); - return array_merge( - $this->urls['regular'], - $this->urls['inferred'] ? array_combine($this->urls['inferred'], $this->urls['inferred']) : array() - ); - } - } - - public function hasCrawled() { - // There are URLs and we're not in the middle of a crawl - return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid'); - } - - /** - * Load the URLs, either by crawling, or by fetching from cache - * @return void - */ - public function loadUrls() { - if($this->hasCrawled()) { - $this->urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); - // Clear out obsolete format - if(!isset($this->urls['regular']) || !isset($this->urls['inferred'])) { - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - } else if($this->autoCrawl) { - $this->crawl(); - - } else { - throw new LogicException("Crawl hasn't been executed yet, and autoCrawl is set to false"); - } - } - - /** - * Re-execute the URL processor on all the fetched URLs - * @return void - */ - public function reprocessUrls() { - if($this->urls === null) $this->loadUrls(); - - // Clear out all inferred URLs; these will be added - $this->urls['inferred'] = array(); - - // Reprocess URLs, in case the processing has changed since the last crawl - foreach($this->urls['regular'] as $url => $oldProcessed) { - $processedURL = $this->generateProcessedURL($url); - $this->urls['regular'][$url] = $processedURL; - - // Trigger parent URL back-filling on new processed URL - $this->parentProcessedURL($processedURL); - } - - $this->saveURLs(); - } - - /** - * - * @param int $limit - * @param bool $verbose - * @return \StaticSiteCrawler - */ - public function crawl($limit=false, $verbose=false) { - increase_time_limit_to(3600); - - if(!is_dir($this->cacheDir)) mkdir($this->cacheDir); - - $crawler = new StaticSiteCrawler($this, $limit, $verbose); - $crawler->enableResumption(); - $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE); - $crawler->setWorkingDirectory($this->cacheDir); - - // Allow for resuming an incomplete crawl - if(file_exists($this->cacheDir.'crawlerid')) { - // We should re-load the partial list of URLs, if relevant - // This should only happen when we are resuming a partial crawl - if(file_exists($this->cacheDir . 'urls')) { - $this->urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); - } else { - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - $crawlerID = file_get_contents($this->cacheDir.'crawlerid'); - $crawler->resume($crawlerID); - } else { - $crawlerID = $crawler->getCrawlerId(); - file_put_contents($this->cacheDir.'/crawlerid', $crawlerID); - $this->urls = array('regular' => array(), 'inferred' => array()); - } - - $crawler->setURL($this->baseURL); - $crawler->go(); - - unlink($this->cacheDir.'crawlerid'); - - ksort($this->urls['regular']); - ksort($this->urls['inferred']); - $this->saveURLs(); - return $crawler; - } - - /** - * Save the current list of URLs to disk - * @return [type] [description] - */ - function saveURLs() { - file_put_contents($this->cacheDir . 'urls', serialize($this->urls)); - } - - /** - * Add a URL to this list, given the absolute URL - * @param string $url The absolute URL - */ - function addAbsoluteURL($url) { - $simpifiedURL = $this->simplifyURL($url); - $simpifiedBase = $this->simplifyURL($this->baseURL); - - if(substr($simpifiedURL,0,strlen($simpifiedBase)) == $simpifiedBase) { - $relURL = substr($url, strlen($this->baseURL)); - } else { - throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); - } - - return $this->addURL($relURL); - } - - function addURL($url) { - if($this->urls === null) $this->loadUrls(); - - // Generate and save the processed URLs - $this->urls['regular'][$url] = $this->generateProcessedURL($url); - - // Trigger parent URL back-filling - $this->parentProcessedURL($this->urls['regular'][$url]); - } - - - /** - * Add an inferred URL to the list. - * - * Since the unprocessed URL isn't available, we use the processed URL in its place. This should be used with - * some caution. - * - * @param string $processedURL The processed URL to add. 
- */ - function addInferredURL($inferredURL) { - if($this->urls === null) $this->loadUrls(); - - // Generate and save the processed URLs - $this->urls['inferred'][$inferredURL] = $inferredURL; - - // Trigger parent URL back-filling - $this->parentProcessedURL($inferredURL); - } - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - /** - * Return true if the given URL exists - * @param string $url The URL, either absolute, or relative starting with "/" - * @return boolean Does the URL exist - */ - function hasURL($url) { - if($this->urls === null) $this->loadUrls(); - - // Try and relativise an absolute URL - if($url[0] != '/') { - $simpifiedURL = $this->simplifyURL($url); - $simpifiedBase = $this->simplifyURL($this->baseURL); - - if(substr($simpifiedURL,0,strlen($simpifiedBase)) == $simpifiedBase) { - $url = substr($simpifiedURL, strlen($simpifiedBase)); - } else { - throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); - } - } - - return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); - } - - /** - * Simplify a URL. - * Ignores https/http differences and "www." / non differences. - * - * @param string $url - * @return string - */ - protected function simplifyURL($url) { - return preg_replace('#^https?://(www\.)?#i','http://www.', $url); - } - - /** - * Returns true if the given URL is in the list of processed URls - * - * @param string $processedURL The processed URL - * @return boolean True if it exists, false otherwise - */ - function hasProcessedURL($processedURL) { - if($this->urls === null) $this->loadUrls(); - - return in_array($processedURL, $this->urls['regular']) || in_array($processedURL, $this->urls['inferred']); - - } - - /** - * Return the processed URL that is the parent of the given one. 
- * - * Both input and output are processed URLs - * - * @param string $url A relative URL - * @return string [description] - */ - function parentProcessedURL($processedURL) { - if($processedURL == "/") return ""; - - // URL heirachy can be broken down by querystring or by URL - $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL,'/')); - - // Special case for children of the root - if($breakpoint == 0) return "/"; - - // Get parent URL - $parentProcessedURL = substr($processedURL,0,$breakpoint); - - // If an intermediary URL doesn't exist, create it - if(!$this->hasProcessedURL($parentProcessedURL)) $this->addInferredURL($parentProcessedURL); - - return $parentProcessedURL; - } - - /** - * Return the regular URL, given the processed one. - * - * Note that the URL processing isn't reversible, so this function works looks by iterating through all URLs. - * If the URL doesn't exist in the list, this function returns null. - * - * @param string $processedURL The URL after processing has been applied. - * @return string The original URL. - */ - function unprocessedURL($processedURL) { - if($url = array_search($processedURL, $this->urls['regular'])) { - return $url; - - } else if(in_array($processedURL, $this->urls['inferred'])) { - return $processedURL; - } else { - return null; - } - } - - /** - * Find the processed URL in the URL list - * @param [type] $url [description] - * @return [type] [description] - */ - function processedURL($url) { - if($this->urls === null) $this->loadUrls(); - - if(isset($this->urls['regular'][$url])) { - // Generate it if missing - if($this->urls['regular'][$url] === true) $this->urls['regular'][$url] = $this->generateProcessedURL($url); - return $this->urls['regular'][$url]; - - } elseif(in_array($url, $this->urls['inferred'])) { - return $url; - } - } - - /** - * Execute custom logic for processing URLs prior to heirachy generation. 
- * - * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. - * - * @param string $url The unprocessed URL - * @return string The processed URL - */ - function generateProcessedURL($url) { - if(!$url) throw new LogicException("Can't pass a blank URL to generateProcessedURL"); - if($this->urlProcessor) $url = $this->urlProcessor->processURL($url); - if(!$url) throw new LogicException(get_class($this->urlProcessor) . " returned a blank URL."); - return $url; - } - - /** - * Return the URLs that are a child of the given URL - * @param [type] $url [description] - * @return [type] [description] - */ - function getChildren($url) { - if($this->urls === null) $this->loadUrls(); - - $processedURL = $this->processedURL($url); - - // Subtly different regex if the URL ends in ? or / - if(preg_match('#[/?]$#',$processedURL)) $regEx = '#^'.preg_quote($processedURL,'#') . '[^/?]+$#'; - else $regEx = '#^'.preg_quote($processedURL,'#') . '[/?][^/?]+$#'; - - $children = array(); - foreach($this->urls['regular'] as $potentialChild => $potentialProcessedChild) { - if(preg_match($regEx, $potentialProcessedChild)) { - if(!isset($children[$potentialProcessedChild])) { - $children[$potentialProcessedChild] = $potentialChild; - } - } - } - foreach($this->urls['inferred'] as $potentialProcessedChild) { - if(preg_match($regEx, $potentialProcessedChild)) { - if(!isset($children[$potentialProcessedChild])) { - $children[$potentialProcessedChild] = $potentialProcessedChild; - } - } - } - - return array_values($children); - } +class StaticSiteUrlList +{ + protected $baseURL, $cacheDir; + + /** + * Two element array: contains keys 'inferred' and 'regular': + * - 'regular' is an array mapping raw URLs to processed URLs + * - 'inferred' is an array of inferred URLs + */ + protected $urls = null; + + protected $autoCrawl = false; + + protected $urlProcessor = null; + + protected $extraCrawlURLs = null; + + /** + * A list of regular 
expression patterns to exclude from scraping + * + * @var array + */ + protected $excludePatterns = array(); + + /** + * Create a new URL List + * @param string $baseURL The Base URL to find links on + * @param string $cacheDir The local path to cache data into + */ + public function __construct($baseURL, $cacheDir) + { + // baseURL mus not have a trailing slash + if (substr($baseURL, -1) == "/") { + $baseURL = substr($baseURL, 0, -1); + } + // cacheDir must have a trailing slash + if (substr($cacheDir, -1) != "/") { + $cacheDir .= "/"; + } + + $this->baseURL = $baseURL; + $this->cacheDir = $cacheDir; + } + + /** + * Set a URL processor for this URL List. + * + * URL processors process the URLs before the site heirarchy and inferred meta-data are generated. + * These can be used to tranform URLs from CMSes that don't provide a natural heirarchy into something + * more useful. + * + * See {@link StaticSiteMOSSURLProcessor} for an example. + * + * @param StaticSiteUrlProcessor $urlProcessor [description] + */ + public function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor) + { + $this->urlProcessor = $urlProcessor; + } + + /** + * Define additional crawl URLs as an array + * Each of these URLs will be crawled in addition the base URL. 
+ * This can be helpful if pages are getting missed by the crawl + */ + public function setExtraCrawlURls($extraCrawlURLs) + { + $this->extraCrawlURLs = $extraCrawlURLs; + } + + /** + * Return the additional crawl URLs as an array + */ + public function getExtraCrawlURLs() + { + return $this->extraCrawlURLs; + } + + /** + * Set an array of regular expression patterns that should be excluded from + * being added to the url list + * + * @param array $excludePatterns + */ + public function setExcludePatterns(array $excludePatterns) + { + $this->excludePatterns = $excludePatterns; + } + + /** + * Get an array of regular expression patterns that should not be added to + * the url list + * + * @return array + */ + public function getExcludePatterns() + { + return $this->excludePatterns; + } + + /** + * + * Set whether the crawl should be triggered on demand. + * @param [type] $autoCrawl [description] + */ + public function setAutoCrawl($autoCrawl) + { + $this->autoCrawl = $autoCrawl; + } + + /** + * Returns the status of the spidering: "Complete", "Partial", or "Not started" + * @return [type] [description] + */ + public function getSpiderStatus() + { + if (file_exists($this->cacheDir . 'urls')) { + if (file_exists($this->cacheDir . 'crawlerid')) { + return "Partial"; + } else { + return "Complete"; + } + } else { + return "Not started"; + } + } + + /** + * Return the number of URLs crawled so far + */ + public function getNumURLs() + { + if ($this->urls) { + $urls = $this->urls; + // Don't rely on loadUrls() as it chokes on partially completed imports + } elseif (file_exists($this->cacheDir . 'urls')) { + $urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); + } else { + return null; + } + + return sizeof(array_unique($urls['regular'])) + sizeof($urls['inferred']); + } + + /** + * Return the raw URLs as an array + * @return array + */ + public function getRawURLs() + { + if ($urls = $this->getProcessedURLs()) { + return array_keys($urls); + } + } + + /** + * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values + * @return array + */ + public function getProcessedURLs() + { + if ($this->hasCrawled() || $this->autoCrawl) { + if ($this->urls === null) { + $this->loadUrls(); + } + return array_merge( + $this->urls['regular'], + $this->urls['inferred'] ? array_combine($this->urls['inferred'], $this->urls['inferred']) : array() + ); + } + } + + public function hasCrawled() + { + // There are URLs and we're not in the middle of a crawl + return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid'); + } + + /** + * Load the URLs, either by crawling, or by fetching from cache + * @return void + */ + public function loadUrls() + { + if ($this->hasCrawled()) { + $this->urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); + // Clear out obsolete format + if (!isset($this->urls['regular']) || !isset($this->urls['inferred'])) { + $this->urls = array('regular' => array(), 'inferred' => array()); + } + } elseif ($this->autoCrawl) { + $this->crawl(); + } else { + throw new LogicException("Crawl hasn't been executed yet, and autoCrawl is set to false"); + } + } + + /** + * Re-execute the URL processor on all the fetched URLs + * @return void + */ + public function reprocessUrls() + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Clear out all inferred URLs; these will be added + $this->urls['inferred'] = array(); + + // Reprocess URLs, in case the processing has changed since the last crawl + foreach ($this->urls['regular'] as $url => $oldProcessed) { + $processedURL = $this->generateProcessedURL($url); + $this->urls['regular'][$url] = $processedURL; + + // Trigger parent URL back-filling on new processed URL + $this->parentProcessedURL($processedURL); + } + + $this->saveURLs(); + } + + /** + * + * @param int $limit + * @param bool $verbose + * @return \StaticSiteCrawler + */ + public function crawl($limit=false, $verbose=false) + { + increase_time_limit_to(3600); + + if (!is_dir($this->cacheDir)) { + mkdir($this->cacheDir); + } + + $crawler = new StaticSiteCrawler($this, $limit, $verbose); + $crawler->enableResumption(); + $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE); + $crawler->setWorkingDirectory($this->cacheDir); + + // Allow for resuming an incomplete crawl + if (file_exists($this->cacheDir.'crawlerid')) { + // We should re-load the partial list of URLs, if relevant + // This should only happen when we are resuming a partial crawl + if (file_exists($this->cacheDir . 'urls')) { + $this->urls = unserialize(file_get_contents($this->cacheDir . 
'urls')); + } else { + $this->urls = array('regular' => array(), 'inferred' => array()); + } + + $crawlerID = file_get_contents($this->cacheDir.'crawlerid'); + $crawler->resume($crawlerID); + } else { + $crawlerID = $crawler->getCrawlerId(); + file_put_contents($this->cacheDir.'/crawlerid', $crawlerID); + $this->urls = array('regular' => array(), 'inferred' => array()); + } + + $crawler->setURL($this->baseURL); + $crawler->go(); + + unlink($this->cacheDir.'crawlerid'); + + ksort($this->urls['regular']); + ksort($this->urls['inferred']); + $this->saveURLs(); + return $crawler; + } + + /** + * Save the current list of URLs to disk + * @return [type] [description] + */ + public function saveURLs() + { + file_put_contents($this->cacheDir . 'urls', serialize($this->urls)); + } + /** + * Add a URL to this list, given the absolute URL + * @param string $url The absolute URL + */ + public function addAbsoluteURL($url) + { + $simpifiedURL = $this->simplifyURL($url); + $simpifiedBase = $this->simplifyURL($this->baseURL); + + if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { + $relURL = substr($url, strlen($this->baseURL)); + } else { + throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); + } + + return $this->addURL($relURL); + } + + public function addURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Generate and save the processed URLs + $this->urls['regular'][$url] = $this->generateProcessedURL($url); + + // Trigger parent URL back-filling + $this->parentProcessedURL($this->urls['regular'][$url]); + } + + + /** + * Add an inferred URL to the list. + * + * Since the unprocessed URL isn't available, we use the processed URL in its place. This should be used with + * some caution. + * + * @param string $processedURL The processed URL to add. 
+ */ + public function addInferredURL($inferredURL) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Generate and save the processed URLs + $this->urls['inferred'][$inferredURL] = $inferredURL; + + // Trigger parent URL back-filling + $this->parentProcessedURL($inferredURL); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Return true if the given URL exists + * @param string $url The URL, either absolute, or relative starting with "/" + * @return boolean Does the URL exist + */ + public function hasURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + // Try and relativise an absolute URL + if ($url[0] != '/') { + $simpifiedURL = $this->simplifyURL($url); + $simpifiedBase = $this->simplifyURL($this->baseURL); + + if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { + $url = substr($simpifiedURL, strlen($simpifiedBase)); + } else { + throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); + } + } + + return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); + } + + /** + * Simplify a URL. + * Ignores https/http differences and "www." / non differences. + * + * @param string $url + * @return string + */ + protected function simplifyURL($url) + { + return preg_replace('#^https?://(www\.)?#i', 'http://www.', $url); + } + + /** + * Returns true if the given URL is in the list of processed URls + * + * @param string $processedURL The processed URL + * @return boolean True if it exists, false otherwise + */ + public function hasProcessedURL($processedURL) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + return in_array($processedURL, $this->urls['regular']) || in_array($processedURL, $this->urls['inferred']); + } + + /** + * Return the processed URL that is the parent of the given one. 
+ * + * Both input and output are processed URLs + * + * @param string $url A relative URL + * @return string [description] + */ + public function parentProcessedURL($processedURL) + { + if ($processedURL == "/") { + return ""; + } + + // URL heirachy can be broken down by querystring or by URL + $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/')); + + // Special case for children of the root + if ($breakpoint == 0) { + return "/"; + } + + // Get parent URL + $parentProcessedURL = substr($processedURL, 0, $breakpoint); + + // If an intermediary URL doesn't exist, create it + if (!$this->hasProcessedURL($parentProcessedURL)) { + $this->addInferredURL($parentProcessedURL); + } + + return $parentProcessedURL; + } + + /** + * Return the regular URL, given the processed one. + * + * Note that the URL processing isn't reversible, so this function works looks by iterating through all URLs. + * If the URL doesn't exist in the list, this function returns null. + * + * @param string $processedURL The URL after processing has been applied. + * @return string The original URL. + */ + public function unprocessedURL($processedURL) + { + if ($url = array_search($processedURL, $this->urls['regular'])) { + return $url; + } elseif (in_array($processedURL, $this->urls['inferred'])) { + return $processedURL; + } else { + return null; + } + } + + /** + * Find the processed URL in the URL list + * @param [type] $url [description] + * @return [type] [description] + */ + public function processedURL($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + if (isset($this->urls['regular'][$url])) { + // Generate it if missing + if ($this->urls['regular'][$url] === true) { + $this->urls['regular'][$url] = $this->generateProcessedURL($url); + } + return $this->urls['regular'][$url]; + } elseif (in_array($url, $this->urls['inferred'])) { + return $url; + } + } + + /** + * Execute custom logic for processing URLs prior to heirachy generation. 
+ * + * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. + * + * @param string $url The unprocessed URL + * @return string The processed URL + */ + public function generateProcessedURL($url) + { + if (!$url) { + throw new LogicException("Can't pass a blank URL to generateProcessedURL"); + } + if ($this->urlProcessor) { + $url = $this->urlProcessor->processURL($url); + } + if (!$url) { + throw new LogicException(get_class($this->urlProcessor) . " returned a blank URL."); + } + return $url; + } + + /** + * Return the URLs that are a child of the given URL + * @param [type] $url [description] + * @return [type] [description] + */ + public function getChildren($url) + { + if ($this->urls === null) { + $this->loadUrls(); + } + + $processedURL = $this->processedURL($url); + + // Subtly different regex if the URL ends in ? or / + if (preg_match('#[/?]$#', $processedURL)) { + $regEx = '#^'.preg_quote($processedURL, '#') . '[^/?]+$#'; + } else { + $regEx = '#^'.preg_quote($processedURL, '#') . 
'[/?][^/?]+$#'; + } + + $children = array(); + foreach ($this->urls['regular'] as $potentialChild => $potentialProcessedChild) { + if (preg_match($regEx, $potentialProcessedChild)) { + if (!isset($children[$potentialProcessedChild])) { + $children[$potentialProcessedChild] = $potentialChild; + } + } + } + foreach ($this->urls['inferred'] as $potentialProcessedChild) { + if (preg_match($regEx, $potentialProcessedChild)) { + if (!isset($children[$potentialProcessedChild])) { + $children[$potentialProcessedChild] = $potentialProcessedChild; + } + } + } + + return array_values($children); + } } -class StaticSiteCrawler extends PHPCrawler { - protected $urlList; - - /** - * - * @var bool - */ - protected $verbose = false; - - function __construct(StaticSiteUrlList $urlList, $limit=false, $verbose=false) { - parent::__construct(); - $this->urlList = $urlList; - $this->verbose = $verbose; - if($limit) { - $this->setPageLimit($limit); - } - } - - function handleHeaderInfo(PHPCrawlerResponseHeader $header) { - // Don't parse 400/500 responses - if($header->http_status_code > 399) { - $message = $header->source_url . " - skipped as it's $header->http_status_code".PHP_EOL; - error_log($message, 3, '/tmp/urls'); - if($this->verbose) { - echo "[!] 
".$message; - } - return -1; - } - } - - function handleDocumentInfo(PHPCrawlerDocumentInfo $info) { - // Ignore errors and redirects - if($info->http_status_code < 200) return; - if($info->http_status_code > 299) return; - - // Ignore non HTML - if(!preg_match('#/x?html#', $info->content_type)) return; - - $this->urlList->addAbsoluteURL($info->url); - if($this->verbose) { - echo "[+] ".$info->url.PHP_EOL; - } - $this->urlList->saveURLs(); - } - - protected function initCrawlerProcess() { - parent::initCrawlerProcess(); - - // Add additional URLs to crawl to the crawler's LinkCache - // NOTE: This is using an undocumented API - if($extraURLs = $this->urlList->getExtraCrawlURLs()) { - foreach($extraURLs as $extraURL) { - $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL)); - } - } - - // Prevent URLs that matches the exclude patterns to be fetched - if($excludePatterns = $this->urlList->getExcludePatterns()) { - foreach($excludePatterns as $pattern) { - $validRegExp = $this->addURLFilterRule('|'.str_replace('|', '\|', $pattern).'|'); - - if(!$validRegExp) { - throw new InvalidArgumentException('Exclude url pattern "'.$pattern.'" is not a valid regular expression.'); - } - } - } +class StaticSiteCrawler extends PHPCrawler +{ + protected $urlList; + + /** + * + * @var bool + */ + protected $verbose = false; + + public function __construct(StaticSiteUrlList $urlList, $limit=false, $verbose=false) + { + parent::__construct(); + $this->urlList = $urlList; + $this->verbose = $verbose; + if ($limit) { + $this->setPageLimit($limit); + } } -} \ No newline at end of file + + public function handleHeaderInfo(PHPCrawlerResponseHeader $header) + { + // Don't parse 400/500 responses + if ($header->http_status_code > 399) { + $message = $header->source_url . " - skipped as it's $header->http_status_code".PHP_EOL; + error_log($message, 3, '/tmp/urls'); + if ($this->verbose) { + echo "[!] 
".$message; + } + return -1; + } + } + + public function handleDocumentInfo(PHPCrawlerDocumentInfo $info) + { + // Ignore errors and redirects + if ($info->http_status_code < 200) { + return; + } + if ($info->http_status_code > 299) { + return; + } + + // Ignore non HTML + if (!preg_match('#/x?html#', $info->content_type)) { + return; + } + + $this->urlList->addAbsoluteURL($info->url); + if ($this->verbose) { + echo "[+] ".$info->url.PHP_EOL; + } + $this->urlList->saveURLs(); + } + + protected function initCrawlerProcess() + { + parent::initCrawlerProcess(); + + // Add additional URLs to crawl to the crawler's LinkCache + // NOTE: This is using an undocumented API + if ($extraURLs = $this->urlList->getExtraCrawlURLs()) { + foreach ($extraURLs as $extraURL) { + $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL)); + } + } + + // Prevent URLs that matches the exclude patterns to be fetched + if ($excludePatterns = $this->urlList->getExcludePatterns()) { + foreach ($excludePatterns as $pattern) { + $validRegExp = $this->addURLFilterRule('|'.str_replace('|', '\|', $pattern).'|'); + + if (!$validRegExp) { + throw new InvalidArgumentException('Exclude url pattern "'.$pattern.'" is not a valid regular expression.'); + } + } + } + } +} diff --git a/code/StaticSiteUrlProcessor.php b/code/StaticSiteUrlProcessor.php index f762bf1..4b443f1 100644 --- a/code/StaticSiteUrlProcessor.php +++ b/code/StaticSiteUrlProcessor.php @@ -13,75 +13,88 @@ * * More sophisticated processing might be done to facilitate importing of less */ -interface StaticSiteUrlProcessor { +interface StaticSiteUrlProcessor +{ - /** - * Return a name for the style of URLs to be processed. - * - * This name will be shown in the CMS when users are configuring the content import. - * - * @return string The name, in plaintext (no HTML) - */ - function getName(); + /** + * Return a name for the style of URLs to be processed. 
+ * + * This name will be shown in the CMS when users are configuring the content import. + * + * @return string The name, in plaintext (no HTML) + */ + public function getName(); - /** - * Return an explanation of what processing is done. - * - * This explanation will be shown in the CMS when users are configuring the content import. - * - * @return string The description, in plaintext (no HTML) - */ - function getDescription(); + /** + * Return an explanation of what processing is done. + * + * This explanation will be shown in the CMS when users are configuring the content import. + * + * @return string The description, in plaintext (no HTML) + */ + public function getDescription(); - /** - * Return a description for this processor, to be shown in the CMS. - * @param string $url The unprocessed URL - * @return string The name - */ - function processURL($url); + /** + * Return a description for this processor, to be shown in the CMS. + * @param string $url The unprocessed URL + * @return string The name + */ + public function processURL($url); } /** * Processor for MOSS URLs */ -class StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor { - function getName() { - return "Simple clean-up (recommended)"; - } +class StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor +{ + public function getName() + { + return "Simple clean-up (recommended)"; + } - function getDescription() { - return "Drop file extensions and trailing slashes on URLs but otherwise leave them the same"; - } + public function getDescription() + { + return "Drop file extensions and trailing slashes on URLs but otherwise leave them the same"; + } - function processURL($url) { - if(preg_match('/^([^?]*)\?(.*)$/', $url, $matches)) { - $url = $matches[1]; - $qs = $matches[2]; - if($url != '/') $url = preg_replace('#/$#','',$url); - $url = preg_replace('#\.[^.]*$#','',$url); - return "$url?$qs"; - } else { - if($url != '/') $url = preg_replace('#/$#','',$url); - $url 
= preg_replace('#\.[^.]*$#','',$url); - return $url; - } - } + public function processURL($url) + { + if (preg_match('/^([^?]*)\?(.*)$/', $url, $matches)) { + $url = $matches[1]; + $qs = $matches[2]; + if ($url != '/') { + $url = preg_replace('#/$#', '', $url); + } + $url = preg_replace('#\.[^.]*$#', '', $url); + return "$url?$qs"; + } else { + if ($url != '/') { + $url = preg_replace('#/$#', '', $url); + } + $url = preg_replace('#\.[^.]*$#', '', $url); + return $url; + } + } } /** * Processor for MOSS URLs */ -class StaticSiteMOSSURLProcessor extends StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor { - function getName() { - return "MOSS-style URLs"; - } +class StaticSiteMOSSURLProcessor extends StaticSiteURLProcessor_DropExtensions implements StaticSiteUrlProcessor +{ + public function getName() + { + return "MOSS-style URLs"; + } - function getDescription() { - return "Remove '/Pages/' from the URL, and drop extensions"; - } + public function getDescription() + { + return "Remove '/Pages/' from the URL, and drop extensions"; + } - function processURL($url) { - $url = str_ireplace('/Pages/','/',$url); - return parent::processURL($url); - } + public function processURL($url) + { + $url = str_ireplace('/Pages/', '/', $url); + return parent::processURL($url); + } } diff --git a/code/tasks/ExternalContentImportContentTask.php b/code/tasks/ExternalContentImportContentTask.php index 3987544..5e06c3d 100644 --- a/code/tasks/ExternalContentImportContentTask.php +++ b/code/tasks/ExternalContentImportContentTask.php @@ -3,35 +3,37 @@ /** * External content - run import as a build task, importing content into a new container */ -class ExternalContentImportContentTask extends BuildTask { +class ExternalContentImportContentTask extends BuildTask +{ - function run($request) { - $id = $request->getVar('ID'); - if((!is_numeric($id) && !preg_match('/^[0-9]+_[0-9]+$/', $id)) || !$id) { - echo "Specify ?ID=(number) or ?ID=(ID)_(Code)
\n"; - return; - } + public function run($request) + { + $id = $request->getVar('ID'); + if ((!is_numeric($id) && !preg_match('/^[0-9]+_[0-9]+$/', $id)) || !$id) { + echo "Specify ?ID=(number) or ?ID=(ID)_(Code)
\n"; + return; + } - $includeSelected = false; - $includeChildren = true; - $duplicates = 'Duplicate'; - $selected = $id; + $includeSelected = false; + $includeChildren = true; + $duplicates = 'Duplicate'; + $selected = $id; - $target = new Page; - $target->Title = "Import on " . date('Y-m-d H:i:s'); - $target->write(); - $targetType = 'SiteTree'; + $target = new Page; + $target->Title = "Import on " . date('Y-m-d H:i:s'); + $target->write(); + $targetType = 'SiteTree'; - $from = ExternalContent::getDataObjectFor($selected); - if ($from instanceof ExternalContentSource) { - $selected = false; - } + $from = ExternalContent::getDataObjectFor($selected); + if ($from instanceof ExternalContentSource) { + $selected = false; + } - $importer = null; - $importer = $from->getContentImporter($targetType); + $importer = null; + $importer = $from->getContentImporter($targetType); - if ($importer) { - $importer->import($from, $target, $includeSelected, $includeChildren, $duplicates); - } - } + if ($importer) { + $importer->import($from, $target, $includeSelected, $includeChildren, $duplicates); + } + } } diff --git a/code/tasks/StaticSiteCrawlURLsTask.php b/code/tasks/StaticSiteCrawlURLsTask.php index 1366102..eeceb69 100644 --- a/code/tasks/StaticSiteCrawlURLsTask.php +++ b/code/tasks/StaticSiteCrawlURLsTask.php @@ -4,17 +4,18 @@ * StaticSiteCrawlURLs * */ -class StaticSiteCrawlURLsTask extends BuildTask { - - function run($request) { - $id = $request->getVar('ID'); - if(!is_numeric($id) || !$id) { - echo "Specify ?ID=(number)
"; - return; - } - // Find all pages - $contentSource = StaticSiteContentSource::get()->byID($id); - $contentSource->urllist()->crawl(false, true); - } +class StaticSiteCrawlURLsTask extends BuildTask +{ + public function run($request) + { + $id = $request->getVar('ID'); + if (!is_numeric($id) || !$id) { + echo "Specify ?ID=(number)
"; + return; + } + // Find all pages + $contentSource = StaticSiteContentSource::get()->byID($id); + $contentSource->urllist()->crawl(false, true); + } } diff --git a/code/tasks/StaticSiteRewriteLinksTask.php b/code/tasks/StaticSiteRewriteLinksTask.php index c91c902..1089fce 100644 --- a/code/tasks/StaticSiteRewriteLinksTask.php +++ b/code/tasks/StaticSiteRewriteLinksTask.php @@ -3,70 +3,71 @@ /** * Rewrite all links in content imported via staticsiteimporter */ -class StaticSiteRewriteLinksTask extends BuildTask { - - function run($request) { - $id = $request->getVar('ID'); - if(!is_numeric($id) || !$id) { - echo "Specify ?ID=(number)
"; - return; - } +class StaticSiteRewriteLinksTask extends BuildTask +{ + + public function run($request) + { + $id = $request->getVar('ID'); + if (!is_numeric($id) || !$id) { + echo "Specify ?ID=(number)
"; + return; + } - // Find all pages - $contentSource = StaticSiteContentSource::get()->byID($id); - $pages = $contentSource->Pages(); + // Find all pages + $contentSource = StaticSiteContentSource::get()->byID($id); + $pages = $contentSource->Pages(); - echo "Looking through " . $pages->Count() . " pages
\n"; + echo "Looking through " . $pages->Count() . " pages
\n"; - // Set up rewriter - $pageLookup = $pages->map('StaticSiteURL', 'ID'); - $baseURL = $contentSource->BaseUrl; + // Set up rewriter + $pageLookup = $pages->map('StaticSiteURL', 'ID'); + $baseURL = $contentSource->BaseUrl; - $rewriter = new StaticSiteLinkRewriter(function($url) use($pageLookup, $baseURL) { - $fragment = ""; - if(strpos($url,'#') !== false) { - list($url,$fragment) = explode('#', $url, 2); - $fragment = '#'.$fragment; - } + $rewriter = new StaticSiteLinkRewriter(function ($url) use ($pageLookup, $baseURL) { + $fragment = ""; + if (strpos($url, '#') !== false) { + list($url, $fragment) = explode('#', $url, 2); + $fragment = '#'.$fragment; + } - if($pageLookup[$url]) { - return '[sitetree_link,id='.$pageLookup[$url] .']' . $fragment; - - } else { - if(substr($url,0,strlen($baseURL)) == $baseURL) { - echo "WARNING: $url couldn't be rewritten.
\n"; - } - return $url . $fragment; - } - }); + if ($pageLookup[$url]) { + return '[sitetree_link,id='.$pageLookup[$url] .']' . $fragment; + } else { + if (substr($url, 0, strlen($baseURL)) == $baseURL) { + echo "WARNING: $url couldn't be rewritten.
\n"; + } + return $url . $fragment; + } + }); - // Perform rewriting - $changedFields = 0; - foreach($pages as $page) { + // Perform rewriting + $changedFields = 0; + foreach ($pages as $page) { + $schema = $contentSource->getSchemaForURL($page->URLSegment); + // Get fields to process + $fields = array(); + foreach ($schema->ImportRules() as $rule) { + if (!$rule->PlainText) { + $fields[] = $rule->FieldName; + } + } + $fields = array_unique($fields); + - $schema = $contentSource->getSchemaForURL($page->URLSegment); - // Get fields to process - $fields = array(); - foreach($schema->ImportRules() as $rule) { - if(!$rule->PlainText) $fields[] = $rule->FieldName; - } - $fields = array_unique($fields); - + foreach ($fields as $field) { + $newContent = $rewriter->rewriteInContent($page->$field); + if ($newContent != $page->$field) { + $newContent = str_replace(array('%5B', '%5D'), array('[', ']'), $newContent); + $changedFields++; - foreach($fields as $field) { - $newContent = $rewriter->rewriteInContent($page->$field); - if($newContent != $page->$field) { - $newContent = str_replace(array('%5B','%5D'),array('[',']'),$newContent); - $changedFields++; + echo "Changed $field on $page->Title (#$page->ID).
"; + $page->$field = $newContent; + } + } - echo "Changed $field on $page->Title (#$page->ID).
"; - $page->$field = $newContent; - } - } - - $page->write(); - } - echo "DONE. Amended $changedFields content fields.
".PHP_EOL; - - } -} \ No newline at end of file + $page->write(); + } + echo "DONE. Amended $changedFields content fields.
".PHP_EOL; + } +}