Skip to content

Commit

Permalink
Use DOM instead of SimpleXml for parsing HTML
Browse files Browse the repository at this point in the history
To get the RSD URL we parse a page's HTML, but this fails with
SimpleXml when the page contains non-XML entities or other markup that
is not well-formed XML, so this changes MediawikiApi::newFromPage() to
use the PHP DOM library instead.

A new exception class 'RsdException' was added, the docblock was
updated, and a test added.

Bug: https://phabricator.wikimedia.org/T163527
  • Loading branch information
samwilson authored and addshore committed Apr 26, 2017
1 parent 94a091c commit a5beebc
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 8 deletions.
29 changes: 21 additions & 8 deletions src/MediawikiApi.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

namespace Mediawiki\Api;

use DOMDocument;
use DOMXPath;
use GuzzleHttp\Client;
use GuzzleHttp\ClientInterface;
use GuzzleHttp\Exception\RequestException;
Expand Down Expand Up @@ -66,21 +68,32 @@ public static function newFromApiEndpoint( $apiEndpoint ) {
}

/**
 * Create a new MediawikiApi object from a URL to any page in a MediaWiki website.
 *
 * The page's HTML is fetched and searched for a Really Simple Discovery (RSD) link in its
 * head; the RSD document is then fetched and the API endpoint is read from its apiLink
 * attribute.
 *
 * @since 2.0
 * @see https://en.wikipedia.org/wiki/Really_Simple_Discovery
 *
 * @param string $url e.g. https://en.wikipedia.org OR https://de.wikipedia.org/wiki/Berlin
 *
 * @return self returns a MediawikiApi instance using the apiEndpoint provided by the RSD
 * file accessible on all Mediawiki pages
 *
 * @throws RsdException If the RSD URL could not be found in the page's HTML.
 */
public static function newFromPage( $url ) {
	$tempClient = new Client( [ 'headers' => [ 'User-Agent' => 'addwiki-mediawiki-client' ] ] );
	$notFoundMessage = "Unable to find RSD URL in page: $url";

	// Get the page HTML. The body is a stream object, so make the string conversion explicit.
	$pageHtml = (string)$tempClient->get( $url )->getBody();
	if ( $pageHtml === '' ) {
		// loadHTML() rejects empty input; an empty page cannot contain an RSD link anyway.
		throw new RsdException( $notFoundMessage );
	}

	// Parse with the HTML parser rather than as XML, because pages are not guaranteed to be
	// well-formed XML. Markup problems are collected by libxml instead of being raised as PHP
	// warnings; if they prevent the link from being found, that is reported below.
	$pageDoc = new DOMDocument();
	$previousErrorSetting = libxml_use_internal_errors( true );
	$pageDoc->loadHTML( $pageHtml );
	libxml_clear_errors();
	libxml_use_internal_errors( $previousErrorSetting );

	// Extract the RSD link.
	$links = ( new DOMXPath( $pageDoc ) )->query( 'head/link[@type="application/rsd+xml"][@href]' );
	if ( $links === false || $links->length === 0 ) {
		throw new RsdException( $notFoundMessage );
	}
	// The XPath predicate guarantees that the matched element has an href attribute.
	$rsdUrl = $links->item( 0 )->getAttribute( 'href' );

	// Then get the RSD XML, and return the API link.
	$rsdXml = new SimpleXMLElement( (string)$tempClient->get( $rsdUrl )->getBody() );
	return self::newFromApiEndpoint( (string)$rsdXml->service->apis->api->attributes()->apiLink );
}

/**
Expand Down
13 changes: 13 additions & 0 deletions src/RsdException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?php

namespace Mediawiki\Api;

use Exception;

/**
 * Thrown when Really Simple Discovery (RSD) cannot be completed, for example when no RSD
 * link can be found in a page's HTML.
 *
 * @see https://en.wikipedia.org/wiki/Really_Simple_Discovery
 */
class RsdException extends Exception {
}
12 changes: 12 additions & 0 deletions tests/Integration/MediawikiApiTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ public function testNewFromPage() {
$this->assertInstanceOf( 'Mediawiki\Api\MediawikiApi', $api );
}

/**
* @covers Mediawiki\Api\MediawikiApi::newFromPage
* @expectedException Mediawiki\Api\RsdException
* @expectedExceptionMessageRegExp |Unable to find RSD URL in page.*|
*/
public function testNewFromPageInvalidHtml() {
	// Any URL whose page lacks the RSD link would do here; the README next to api.php is
	// used because it is reachable from the test environment and does not return a 404.
	$apiUrl = TestEnvironment::newInstance()->getApiUrl();
	$readmeUrl = str_replace( 'api.php', 'README', $apiUrl );

	// Expected to throw; the expectation itself is declared in the method's annotations.
	MediawikiApi::newFromPage( $readmeUrl );
}

/**
* @covers Mediawiki\Api\MediawikiApi::getRequest
* @covers Mediawiki\Api\MediawikiApi::getClientRequestOptions
Expand Down

0 comments on commit a5beebc

Please sign in to comment.