-
Notifications
You must be signed in to change notification settings - Fork 12
Home
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 14, 2022
·
10 revisions
Welcome to the metadata wiki!
"""Usages of the three implementations.
ThreadPoolExecutor:
cc_html_infos, errs = get_htmls(cc_info_dicts, 120)
Parallel:
cc_html_infos, errs = get_htmls_in_parallel(cc_info_dicts, 120)
Async I/O:
cc_html_infos, errs = asyncio.get_event_loop().run_until_complete(get_htmls_in_uvloop(cc_info_dicts, 120))
"""
import concurrent.futures
import gzip
import io
from itertools import islice
import json
from aiohttp_retry import FibonacciRetry, RetryClient
import asyncio
from charset_normalizer import from_bytes
from joblib import Parallel, delayed
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import threading
from tqdm.auto import tqdm
def get_chunk_iter(iterable, size):
    """Lazily split *iterable* into consecutive tuples of at most *size* items.

    Args:
        iterable: Any iterable; it is consumed lazily, one chunk at a time.
        size: Maximum number of items per chunk; must be at least 1.

    Returns:
        An iterator of tuples. Every tuple holds ``size`` items except
        possibly the last one, which holds the remainder. An empty input
        produces no chunks.

    Raises:
        ValueError: If ``size`` is smaller than 1. With ``size == 0`` the
            first chunk would be the empty tuple, which is also the stop
            sentinel, so every item would be silently discarded.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    it = iter(iterable)
    # iter(callable, sentinel) calls the lambda until it returns the empty
    # tuple, i.e. until the underlying iterator is exhausted.
    return iter(lambda: tuple(islice(it, size)), ())
# HTTP status codes handed to the retry policies of both the aiohttp and the
# requests code paths as the responses that should be retried.
_RETRY_STATUSES = (500, 502, 503, 504)
# Retry budget shared by both code paths (FibonacciRetry attempts / urllib3
# Retry total).
_RETRY_MAX = 4
async def get_html_in_aio(cc_info, aio_session):
    """Fetch one gzipped WARC record over HTTP and return its HTML payload.

    The record is addressed by ``cc_info["warc_filename"]`` plus a byte range
    built from ``warc_record_offset`` and ``warc_record_length``. The response
    body is gunzipped, decoded and split on blank lines (``\\r\\n\\r\\n``);
    the third section is returned (presumably the first two are the WARC and
    HTTP header sections -- confirm against the WARC layout).

    Args:
        cc_info: Mapping with the keys ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.
        aio_session: Object whose ``get(url, headers=...)`` returns an async
            context manager yielding a response with ``status``, ``reason``
            and an awaitable ``read()``.

    Returns:
        The stripped third section of the record, or ``None`` when the
        decoded record is empty or has fewer than three sections.

    Raises:
        RuntimeError: Wraps any error raised while downloading, decompressing
            or decoding; the message carries ``cc_info["url"]``.
    """
    warc_url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    first_byte = int(cc_info["warc_record_offset"])
    record_size = int(cc_info["warc_record_length"])
    last_byte = first_byte + record_size - 1
    encoding = cc_info["content_charset"]
    if pd.isna(encoding):
        encoding = "utf-8"
    range_header = {"Range": f"bytes={first_byte}-{last_byte}"}
    try:
        async with aio_session.get(warc_url, headers=range_header) as resp:
            # A ranged request must be answered with 206 Partial Content.
            if resp.status != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{resp.status}: {resp.reason}")
            compressed = await resp.read()
        with io.BytesIO(compressed) as buffer, gzip.GzipFile(fileobj=buffer) as record:
            raw = record.read()
        try:
            if encoding in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                # UTF-16 records go through charset detection instead of a
                # direct decode.
                text = str(from_bytes(raw).best()).strip()
            else:
                text = raw.decode(encoding, "backslashreplace").strip()
        except LookupError as le:
            # Unknown codec name: fall back to charset detection.
            print(f"{repr(le)}: {cc_info['url']}; switching to charset-normalizer")
            text = str(from_bytes(raw).best()).strip()
        if not text:
            return None
        sections = text.split("\r\n\r\n", 2)
        if len(sections) != 3:
            return None
        return sections[2].strip()
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err
async def get_htmls_in_uvloop(cc_infos, chunk_size=256):
    """Download the HTML of many records concurrently with asyncio.

    Records are processed in chunks of ``chunk_size``; all downloads of one
    chunk run concurrently through a single retrying client session. Nothing
    in this coroutine selects the event loop implementation; that is left to
    the caller.

    Args:
        cc_infos: Iterable of mutable mappings accepted by
            ``get_html_in_aio``.
        chunk_size: Number of records fetched concurrently per chunk.

    Returns:
        A ``(cc_info_htmls, errs)`` pair of lists. Successful records gain an
        ``"html"`` key; failed records gain an ``"err"`` key of the form
        ``{"src": "get_htmls", "val": repr(exception)}``.
    """
    cc_info_htmls, errs = [], []
    add_cc_info_html, add_err = cc_info_htmls.append, errs.append
    retry_options = FibonacciRetry(attempts=_RETRY_MAX, statuses=_RETRY_STATUSES)
    async with RetryClient(raise_for_status=False, retry_options=retry_options) as rc:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            # return_exceptions=True keeps one failed download from raising
            # out of gather() and aborting the whole crawl; each failure is
            # handed back in place of its result and recorded below.
            results = await asyncio.gather(
                *(get_html_in_aio(cc_info, rc) for cc_info in cc_infos_chnk),
                return_exceptions=True,
            )
            for result, cc_info in zip(results, cc_infos_chnk):
                try:
                    if isinstance(result, BaseException):
                        # Non-Exception failures (e.g. cancellation) are not
                        # caught below and still propagate.
                        raise result
                    if not result:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = result
                    add_cc_info_html(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    add_err(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs
# Per-thread storage so that every worker thread gets its own Session.
_THREAD_LOCAL = threading.local()


def _get_session():
    """Return the calling thread's ``requests.Session``, creating it lazily.

    The first call in a thread builds a session whose ``https://`` adapter
    retries up to ``_RETRY_MAX`` times (backoff factor 1) on the status codes
    in ``_RETRY_STATUSES``; later calls in the same thread return that same
    session.
    """
    try:
        return _THREAD_LOCAL.session
    except AttributeError:
        # First use in this thread: fall through and build the session.
        pass
    session = requests.Session()
    retry_policy = Retry(
        total=_RETRY_MAX, backoff_factor=1, status_forcelist=_RETRY_STATUSES
    )
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    _THREAD_LOCAL.session = session
    return session
def get_html(cc_info, timeout=None):
    """Fetch one gzipped WARC record over HTTP and return its HTML payload.

    Blocking counterpart of ``get_html_in_aio``; it uses the calling thread's
    session from ``_get_session``. The record is gunzipped, decoded and split
    on blank lines (``\\r\\n\\r\\n``); the third section is returned
    (presumably the first two are the WARC and HTTP header sections --
    confirm against the WARC layout).

    Args:
        cc_info: Mapping with the keys ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.
        timeout: Forwarded to ``Session.get``. The default ``None`` keeps the
            previous behaviour of waiting indefinitely; pass a number or a
            ``(connect, read)`` tuple so a stalled connection cannot block a
            worker thread forever.

    Returns:
        The stripped third section of the record, or ``None`` when the
        decoded record is empty or has fewer than three sections.

    Raises:
        RuntimeError: Wraps any error raised while downloading, decompressing
            or decoding; the message carries ``cc_info["url"]``.
    """
    url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    begin, length = int(cc_info["warc_record_offset"]), int(cc_info["warc_record_length"])
    end = begin + length - 1
    charset = cc_info["content_charset"]
    if pd.isna(charset):
        charset = "utf-8"
    s = _get_session()
    try:
        with s.get(url, headers={"Range": f"bytes={begin}-{end}"}, timeout=timeout) as r:
            # A ranged request must be answered with 206 Partial Content.
            if r.status_code != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{r.status_code}: {r.reason}")
            content = r.content
        with io.BytesIO(content) as gzf, gzip.GzipFile(fileobj=gzf) as f:
            b = f.read()
        try:
            if charset in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                # UTF-16 records go through charset detection instead of a
                # direct decode.
                d = str(from_bytes(b).best()).strip()
            else:
                d = b.decode(charset, "backslashreplace").strip()
        except LookupError as le:
            # Unknown codec name: fall back to charset detection.
            print(f"{repr(le)}: {cc_info['url']}; switching to charset-normalizer")
            d = str(from_bytes(b).best()).strip()
        if not d:
            return None
        data_parts = d.split("\r\n\r\n", 2)
        return data_parts[2].strip() if len(data_parts) == 3 else None
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err
def _get_html_or_error(cc_info):
    """Call ``get_html`` and return the raised exception instead of raising it."""
    try:
        return get_html(cc_info)
    except Exception as err:
        return err


def get_htmls_in_parallel(cc_infos, chunk_size=256):
    """Download the HTML of many records with a joblib thread pool.

    Args:
        cc_infos: Iterable of mutable mappings accepted by ``get_html``.
        chunk_size: Number of records per chunk; also used as ``n_jobs``.

    Returns:
        A ``(cc_info_htmls, errs)`` pair of lists. Successful records gain an
        ``"html"`` key; failed records gain an ``"err"`` key of the form
        ``{"src": "get_htmls", "val": repr(exception)}``.
    """
    cc_info_htmls, errs = [], []
    add_cc_info_html, add_err = cc_info_htmls.append, errs.append
    with Parallel(n_jobs=chunk_size, prefer="threads") as parallel:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            # Workers hand exceptions back as values: an exception raised
            # inside a job would propagate out of the parallel call and abort
            # the whole crawl before the per-record handling below could run.
            results = parallel(
                delayed(_get_html_or_error)(cc_info) for cc_info in cc_infos_chnk
            )
            for result, cc_info in zip(results, cc_infos_chnk):
                try:
                    if isinstance(result, Exception):
                        raise result
                    if not result:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = result
                    add_cc_info_html(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    add_err(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs
def get_htmls(cc_infos, chunk_size=256):
    """Download the HTML of many records with a ``ThreadPoolExecutor``.

    Args:
        cc_infos: Iterable of mutable mappings accepted by ``get_html``.
        chunk_size: Number of records submitted per chunk; also used as the
            number of worker threads.

    Returns:
        A ``(cc_info_htmls, errs)`` pair of lists. Successful records gain an
        ``"html"`` key; failed records gain an ``"err"`` key of the form
        ``{"src": "get_htmls", "val": repr(exception)}``. Within a chunk,
        records are appended in completion order.
    """
    cc_info_htmls, errs = [], []
    add_cc_info_html, add_err = cc_info_htmls.append, errs.append
    chunks = tuple(get_chunk_iter(cc_infos, chunk_size))
    if not chunks:
        # Nothing to fetch; also avoids building a pool for empty input.
        return cc_info_htmls, errs
    # One pool for the whole run: worker threads (and the thread-local
    # sessions that get_html creates in them) survive across chunks instead
    # of being torn down and rebuilt for every chunk.
    with concurrent.futures.ThreadPoolExecutor(max_workers=chunk_size) as exctr:
        for cc_infos_chnk in tqdm(chunks):
            html_ftr_to_cc_info = {
                exctr.submit(get_html, cc_info): cc_info for cc_info in cc_infos_chnk
            }
            for ftr in concurrent.futures.as_completed(html_ftr_to_cc_info):
                cc_info = html_ftr_to_cc_info[ftr]
                try:
                    # result() re-raises whatever get_html raised in the worker.
                    html = ftr.result()
                    if not html:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = html
                    add_cc_info_html(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    add_err(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs
HtmlContentExtractor
/**
 * Extracts translatable text, image sources and related attributes from a
 * DOMDocument, keyed by node path.
 */
class HtmlContentExtractor
{
    // Document being inspected.
    private $dom;
    // DOMXPath bound to $dom; used for every query in this class.
    protected $xpath;

    /**
     * Constructor
     *
     * Forces the document encoding to UTF-8, disables formatted output and
     * prepares the XPath evaluator. A custom error handler is installed for
     * the duration of the set-up and always restored afterwards.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);
        try {
            $this->dom = $dom;
            if (!$this->dom) {
                throw new \Exception("DOMDocument is invalid.");
            }
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * Keys are either "nodePath|openingTag|closingTag" for text grouped
     * under a block-like ancestor, or a plain node path for stand-alone text
     * nodes, meta description/keywords, input values, input placeholders
     * (suffixed with "/@placeholder") and alt attributes.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = array();
        $blocks = array();
        // Text nodes outside script/style/code and outside translate="no".
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $e */
        foreach ($elementArray as $e) {
            //$temp = self::whiteSpaceNormalization($e->C14N());
            $temp = self::whiteSpaceNormalization($this->dom->saveHTML($e));
            $temp = preg_replace("/<[^>]+>/u", "", $temp);
            $temp = preg_replace("/[\s\d]+/u", "", $temp);
            // Skip text nodes that contain nothing but tags, white space
            // and digits.
            if ($temp !== '') {
                $nodePath = $e->getNodePath();
                // Candidate ancestors that may serve as the enclosing block.
                $ancestorQuery
                    = "$nodePath/ancestor::p[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::a[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::div[not(descendant::table)"
                    ." and not(descendant::div)"
                    ." and not(descendant::code)"
                    ." and normalize-space(text())]"
                    ." | $nodePath/ancestor::font[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::span[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::li[not(descendant::table)"
                    ." and not(descendant::li)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::dt[not(descendant::table)]"
                    ." | $nodePath/ancestor::dd[not(descendant::table)]"
                    ." | $nodePath/ancestor::td[not(descendant::table)"
                    ." and not(descendant::div) "
                    ." and not(descendant::code)"
                    ." and normalize-space(text())]"
                    ." | $nodePath/ancestor::th[not(descendant::table)]"
                    ." | $nodePath/ancestor::b[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::i[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::u[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::tt[not(descendant::table)]"
                    ." | $nodePath/ancestor::blockquote[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::strike[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::em[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::strong[not(descendant::table)"
                    ." and not(descendant::code)]"
                    ." | $nodePath/ancestor::iframe[not(descendant::table)"
                    ." and not(descendant::div) and normalize-space(text())]"
                ;
                $ancestors = $this->xpath->query($ancestorQuery);
                if (!$ancestors) {
                    throw new \Exception(
                        "$ancestorQuery is an incorrect XPath query."
                    );
                } elseif (0 === $ancestors->length) {
                    // No block-like ancestor: keep the text node on its own.
                    //$blocks[$nodePath] = $e->C14N();
                    $blocks[$nodePath] = $this->dom->saveHTML($e);
                } else {
                    // If any candidate ancestor was already stored, this
                    // text node is part of that block and is skipped.
                    $isExtractedBlock = false;
                    for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                        $blockNode = $ancestors->item($i);
                        //$blockOuterC14N = $blockNode->C14N();
                        $blockOuterC14N = $this->dom->saveHTML($blockNode);
                        $blockNodePath = $blockNode->getNodePath();
                        $headAndTheRest = preg_split(
                            "/^(<$blockNode->nodeName[^>]*>)/iu",
                            $blockOuterC14N,
                            2,
                            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                        );
                        $head = $headAndTheRest[0];
                        $tail = "</$blockNode->nodeName>";
                        $key = "$blockNodePath|$head|$tail";
                        if (array_key_exists($key, $blocks)) {
                            $isExtractedBlock = true;
                            break;
                        }
                    }
                    if ($isExtractedBlock) {
                        continue;
                    }
                    // Store the inner HTML of the last ancestor in the
                    // result list, i.e. everything between its opening tag
                    // and its final closing tag.
                    $blockNode = $ancestors->item($ancestors->length - 1);
                    //$blockOuterC14N = $blockNode->C14N();
                    $blockOuterC14N = $this->dom->saveHTML($blockNode);
                    $blockNodePath = $blockNode->getNodePath();
                    $headAndTheRest = preg_split(
                        "/^(<$blockNode->nodeName[^>]*>)/iu",
                        $blockOuterC14N,
                        2,
                        PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                    );
                    $head = $headAndTheRest[0];
                    $tail = "</$blockNode->nodeName>";
                    $end = strripos($headAndTheRest[1], $tail);
                    $blockInnerC14N = substr($headAndTheRest[1], 0, $end);
                    $blocks["$blockNodePath|$head|$tail"] = $blockInnerC14N;
                }
            }
        }
        $blocks = array_reverse($blocks);
        foreach ($blocks as $path => $html) {
            // Strip serialized carriage returns so that only the line feed
            // of a CRLF pair remains in the extracted text.
            // NOTE(review): the search string was a bare line break in this
            // copy, which looks like a decoded "&#13;" character reference;
            // restored on that assumption -- confirm against the upstream
            // source.
            $textArray[$path] = str_replace('&#13;', '', $html);
        }
        // <meta name="description|keywords"> with a non-empty content.
        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            // Ignore values made only of white space and digits.
            if (1 != preg_match("/^[\s\d]+$/u", $meta->getAttribute("content"))) {
                $textArray[$meta->getNodePath()]
                    = $meta->getAttribute("content");
            }
        }
        // Visible values of button-like and text inputs.
        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("value"))) {
                $textArray[$input->getNodePath()]
                    = $input->getAttribute("value");
            }
        }
        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        foreach ($inputsWithPlaceholder as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("placeholder"))
            ) {
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $input->getAttribute("placeholder");
            }
        }
        // $attributes =
        //     $this->m_xpath->query(
        //         "//*["
        //         ."string(@abbr) or string(@alt) or string(@label)"
        //         ." or string(@title) or string(@standby)"
        //         ." or string(@summary)"
        //         ."]");
        // foreach ($attributes as $a)
        //     $textArray[$a->getNodePath()] = $a->C14N();
        // Only non-empty alt attributes are collected at the moment.
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }
        return $textArray;
    }

    /**
     * Get text array
     *
     * @return array Values of getNodePathAndTextMap(), re-indexed from 0
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts together with the source line of their node
     *
     * @return array List of [lineNumber, text] pairs
     * @throws \Exception
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        $nodePathAndTextMap = $this->getNodePathAndTextMap();
        foreach ($nodePathAndTextMap as $nodePathWithHeadTail => $text) {
            // Keys may look like "path|head|tail"; only the path is queried.
            $nodePath = array_values(explode('|', $nodePathWithHeadTail))[0];
            /** @var \DOMNodeList $nodeList */
            $nodeList = $this->xpath->query($nodePath);
            $lineNumber = $nodeList->item(0)->getLineNo();
            $textWithLineNumberList[] = [$lineNumber, $text];
        }
        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Node path of every <img> with a non-empty src => src
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = array();
        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMNode $i */
        foreach ($images as $i) {
            $imageArray[$i->getNodePath()]
                = $i->attributes->getNamedItem('src')->nodeValue;
        }
        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array Values of getNodePathAndImageMap(), re-indexed from 0
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        // Remove white space between adjacent tags, then line breaks and
        // leading/trailing white space of every line.
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);
        if ($jsonFriendly) {
            $html = str_replace('"', '\"', $html);
        }
        return $html;
    }

    /**
     * Encode the string into HTML Encoding format
     *
     * Every character becomes a decimal numeric character reference.
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        $str = mb_convert_encoding($str, 'UTF-32', 'UTF-8');
        $t = unpack("N*", $str);
        $t = array_map(
            function ($n) {
                return "&#$n;";
            }, $t
        );
        return implode("", $t);
    }

    /**
     * Normalize white space inside the text
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);
        // replace uncommon white space with ordinary white space
        // (the "|" after &#8233; was missing, which fused it with &#32;
        // into a single two-character alternative)
        $text = preg_replace(
            '/(\&\#5760\;|\&\#6158\;|'
            .'\&\#8192\;|\&\#8193\;|'
            .'\&\#8194\;|\&\#8195\;|\&\#8196\;|\&\#8197\;|\&\#8198\;|\&\#8199\;|'
            .'\&\#8200\;|\&\#8201\;|\&\#8204\;|\&\#8205\;|\&\#8206\;|\&\#8207\;|'
            .'\&\#8202\;|\&\#8239\;|\&\#8287\;|\&\#12288\;|\&\#10\;|'
            .'\&\#11\;|\&\#12\;|\&\#13\;|\&\#133\;|\&\#8232\;|\&\#8233\;|'
            .'\&\#32\;|\&\#09\;|\&\#11\;|'
            .'\&\#160\;|\&\#9\;)+/u',
            " ",
            $text
        );
        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }
        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        // NOTE(review): the named entities nbsp, uml, shy, macr, acute and
        // cedil as well as "&#10;" appeared as already-decoded characters in
        // this copy; they are restored here to match the named/hexadecimal
        // pairing of the rest of the list -- confirm against the upstream
        // source.
        $text = trim(
            preg_replace(
                '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
                .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
                .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
                .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
                .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&\#10\;|\x{FEFF})+/u',
                " ",
                $text
            )
        );
        return $text;
    }
}
HtmlContentExtractorTest
/**
 * Tests of HtmlContentExtractor against saved pages, run with DOMs produced
 * by both PhpDom and Html5Dom.
 */
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    // Directory holding the HTML fixtures (ends with a directory separator).
    protected $dataFolderPath;
    // Paths of the two hotel pages used by most tests.
    protected $oInnPage;
    protected $solarePage;
    /** @var \DOMDocument $phpDom */
    protected $phpDom;
    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;
    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the fixture paths
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();
        $fixtureDir = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->dataFolderPath = $fixtureDir;
        $this->oInnPage = $fixtureDir .'www.o-inn.co.jp_index.html';
        $this->solarePage = $fixtureDir .'www.solarehotels.com.html';
    }

    /**
     * Parse one file with both DOM factories
     *
     * @param string $htmlFile Path of the HTML file
     *
     * @return array The PhpDom document followed by the Html5Dom document
     */
    private function makeBothDoms($htmlFile)
    {
        return [PhpDom::make($htmlFile), Html5Dom::make($htmlFile)];
    }

    /**
     * Test get text array with line number
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $expectedFirstPair = [108, 'For Smileage member'];

        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $pairs = $this->extractor->getTextArrayWithLineNumber();
        $this->assertEquals($expectedFirstPair, array_values($pairs)[0]);

        // Everything below is skipped on purpose for the HTML5 parser.
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');

        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $pairs = $this->extractor->getTextArrayWithLineNumber();
        $this->assertEquals($expectedFirstPair, array_values($pairs)[0]);
    }

    /**
     * Test get node path and text map
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        // Solare page: meta keywords only.
        $keywordsPath = "/html/head/meta[5]";
        $keywordsText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
            ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        foreach ($this->makeBothDoms($this->solarePage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $map = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($keywordsPath, $map);
            $this->assertEquals($keywordsText, $map[$keywordsPath]);
        }

        // O-Inn page: meta keywords plus a paragraph with an inline link.
        $keywordsPath = "/html/head/meta[3]";
        $keywordsText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
            ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $interpolatedPath
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        $interpolatedText
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
            .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
            .' target="_blank">'
            . PHP_EOL //< Masterminds\HTML5 seems not using source's EOL.
            .' ホテルお茶の水イン'
            .'</a>'
            .'のクチコミはTripAdvisorより提供を受けています'
        ;
        foreach ($this->makeBothDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $map = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($keywordsPath, $map);
            $this->assertEquals($keywordsText, $map[$keywordsPath]);
            $this->assertArrayHasKey($interpolatedPath, $map);
            $this->assertEquals($interpolatedText, $map[$interpolatedPath]);
        }
    }

    /**
     * Test get text array
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        $commonCase
            = ' * 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &'
            .' RESORTS official website.<br>'."\n"
            .' * 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        foreach ($this->makeBothDoms($this->solarePage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $texts = $this->extractor->getTextArray();
            $this->assertContains($commonCase, $texts);
        }

        $commonCase
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
            .'東京都文京区湯島1-3-7<br>TEL:03-3813-8211<br>'
            .'FAX:03-3813-9730<br>'
            .'<a href="/transportation/">お茶の水インまでの地図</a>'
        ;
        $interpolatedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
            .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
            .' target="_blank">'
            ."\n ホテルお茶の水イン"
            .'</a>'
            .'のクチコミはTripAdvisorより提供を受けています'
        ;
        foreach ($this->makeBothDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $texts = $this->extractor->getTextArray();
            $this->assertContains($commonCase, $texts);
            $this->assertContains($interpolatedCase, $texts);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        // The fixture itself must contain the superfluous closing </a>.
        $rawHtml = file_get_contents($this->oInnPage);
        $illFormedSnippet
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        $this->assertNotEquals(false, strpos($rawHtml, $illFormedSnippet));

        // The extracted text must not carry that extra closing tag.
        $repairedSnippet
            = '掲載されている'
            . '<a href="http://www.tripadvisor.jp/'
            . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            . "\n ホテルお茶の水イン</a>"
            . 'のクチコミはTripAdvisorより提供を受けています';
        foreach ($this->makeBothDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $texts = $this->extractor->getTextArray();
            $this->assertContains($repairedSnippet, $texts);
        }
    }

    /**
     * Testing for placeholder attribute extraction from input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $placeholderPath = '/html/body/input/@placeholder';
        $placeholderText = 'お名前';
        $fixture = $this->dataFolderPath .'inputPlaceholderTest.html';
        foreach ($this->makeBothDoms($fixture) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $map = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($placeholderPath, $map);
            $this->assertEquals($placeholderText, $map[$placeholderPath]);
        }
    }

    /**
     * Testing for alt attribute extraction from image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $altPath = '/html/body/img/@alt';
        $altText = '画像です';
        $fixture = $this->dataFolderPath .'imageAltTest.html';
        foreach ($this->makeBothDoms($fixture) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $map = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($altPath, $map);
            $this->assertEquals($altText, $map[$altPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $bomOnlyPath = '/html/body/div[1]/text()[3]';
        $fixture = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        foreach ($this->makeBothDoms($fixture) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $map = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($bomOnlyPath, $map);
        }
    }

    // /**
    //  * Test sorting the node-path-to-text map by the line numbers of the HTML
    //  *
    //  * @return void
    //  */
    // public function testSortNodePathAndTextMapByLineNum()
    // {
    //     $file = $this->dataFolderPath .'replaceNodeXpath.html';
    //     $doms = [PhpDom::make($file), Html5Dom::make($file)];
    //     foreach ($doms as $dom) {
    //         $extractor = new HtmlContentExtractor($dom);
    //         $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
    //         $this->assertEquals(
    //             'HTML Test', array_values($nodePathAndTextMap)[0]
    //         );
    //         $this->assertEquals(
    //             '選べるリージョンとゾーン',
    //             array_values($nodePathAndTextMap)[4]
    //         );
    //     }
    // }
}
PartialHtmlWrapper
/**
 * Wraps an HTML fragment in a fixed document skeleton and removes that
 * skeleton again.
 */
class PartialHtmlWrapper
{
    // Document prologue placed in front of the fragment by wrap().
    // The heredoc bodies and closing markers stay flush left on purpose:
    // indenting them would change the constant values.
    const HEADER = <<<HTML_HEADER
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"></head><body>
HTML_HEADER;
    // Variant of the prologue with an explicitly closed <meta> element; it
    // is only used by unwrap().
    // NOTE(review): the apostrophe before </meta> looks unintended and would
    // keep this variant from matching serialized output -- confirm.
    const HEADER_C14N = <<<HTML_HEADER_C14N
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type">'</meta></head><body>
HTML_HEADER_C14N;
    // Document epilogue appended after the fragment by wrap().
    const FOOTER = <<<HTML_FOOTER
</body></html>
HTML_FOOTER;

    /**
     * Wrap the partial HTML
     *
     * @param string $partialHtml Partial HTML
     *
     * @return string HEADER, the fragment and FOOTER concatenated
     */
    public static function wrap($partialHtml)
    {
        $document = self::HEADER;
        $document .= $partialHtml;
        $document .= self::FOOTER;
        return $document;
    }

    /**
     * Unwrap wrapped partial HTML
     *
     * Every occurrence of HEADER_C14N, HEADER and FOOTER is removed, in that
     * order.
     *
     * @param string $wrappedPartialHtml Wrapped partial HTML
     *
     * @return string
     */
    public static function unwrap($wrappedPartialHtml)
    {
        $skeletonParts = array(self::HEADER_C14N, self::HEADER, self::FOOTER);
        return str_replace($skeletonParts, '', $wrappedPartialHtml);
    }
}
English HTML sentence segmenter
/**
 * Sentence segmenter for English text that may contain inline HTML.
 *
 * Quoted spans, selected HTML elements, titles such as "Dr." and ellipses
 * are replaced by placeholders before splitting on full stops, and restored
 * afterwards.
 */
class EnglishSentenceSegmenter extends AbstractSentenceSegmenter
{
    // Placeholder => original text, rebuilt by every preprocess() call.
    private $_replacements;
    // Abbreviations whose dots must not end a sentence.
    protected $titles
        = array(
            "Mr.", "Ms.", "Mrs.", "Dr.", "Prof.",
            "M.", "Ph.D.", "D.Phil.", "M.D.", "D.O.",
            "Capt.", "Cpl.", "Sgt.", "Maj.", "Gen.", "Messrs."
        );
    // Ellipsis spellings whose dots must not end a sentence.
    protected $ellipsis = array("...", ". . .");

    /**
     * Constructor
     *
     * @param bool $preserveSpaces Keep leading white space of sentences
     */
    public function __construct($preserveSpaces=false)
    {
        parent::__construct($preserveSpaces);
        mb_internal_encoding("UTF-8");
    }

    /**
     * Pre-process
     *
     * Replaces quoted spans with "<qN>", titles with "<tN>" and ellipses
     * with "<eN>" placeholders and remembers the originals for
     * postprocess().
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    public function preprocess($rawHtml)
    {
        $cookedText = $rawHtml;
        if (!$this->_preserveSpaces) {
            $cookedText = preg_replace("/[\r\n\s]+/u", " ", $cookedText);
            $cookedText = trim($cookedText);
        } else {
            //Preserve white spaces at the beginning of the sentence
            $cookedText = rtrim($cookedText);
        }
        $this->_replacements = array();

        // Treat these elements like quotation marks so that full stops
        // inside them do not split a sentence.
        $quoteLikeTags = array(
            "span", "font", "a", "li",
            "h1", "h2", "h3", "h4", "h5", "h6", "p"
        );
        foreach ($quoteLikeTags as $tagName) {
            $this->addHtmlTagAsQuote(
                $tagName, $cookedText, $this->quoteKeys, $this->quotePairs
            );
        }

        // Pass 1: collect every open-quote position (in characters) together
        // with the position of its closing counterpart.
        $begin2OpenQuoteAndEndMap = array();
        for ($i = 0; $i < count($this->quoteKeys); $i++) {
            $openQuote = $this->quoteKeys[$i];
            $offset = 0;
            $begin = mb_strpos($cookedText, $openQuote, $offset);
            while (false !== $begin) {
                $isStraightQuote = ("\"" === $openQuote || "'" === $openQuote);
                // A straight quote only opens a quotation at the start of
                // the text or after a space. $begin is a character offset,
                // so the preceding character must be read with mb_substr();
                // byte indexing picks the wrong byte after multibyte text.
                if ($isStraightQuote
                    && $begin > 0
                    && " " != mb_substr($cookedText, $begin - 1, 1)
                ) {
                    // Resume right after the rejected quote instead of
                    // rescanning it one character at a time.
                    $offset = $begin + 1;
                } else {
                    $closeQuote = $this->quotePairs[$openQuote];
                    $offset = $begin + mb_strlen($openQuote);
                    $end = mb_strpos($cookedText, $closeQuote, $offset);
                    if (false !== $end) {
                        $offset = $end + mb_strlen($closeQuote);
                        $begin2OpenQuoteAndEndMap[$begin] = array($openQuote, $end);
                    }
                }
                $begin = mb_strpos($cookedText, $openQuote, $offset);
            }
        }

        // Pass 2: in order of position, keep only spans that do not overlap
        // a span that was kept before.
        ksort($begin2OpenQuoteAndEndMap);
        $previousEnds = array();
        $begin2OpenQuoteMap = array();
        foreach ($begin2OpenQuoteAndEndMap as $begin => $openQuoteAndEnd) {
            $openQuote = $openQuoteAndEnd[0];
            $end = $openQuoteAndEnd[1];
            $isOverlapped = false;
            foreach ($previousEnds as $previousEnd) {
                if ($end < $previousEnd || $begin < $previousEnd) {
                    $isOverlapped = true;
                    break;
                }
            }
            if (!$isOverlapped) {
                $previousEnds[] = $end;
                $begin2OpenQuoteMap[$begin] = $openQuote;
            }
        }

        // Pass 3: replace every kept span by a "<qN>" placeholder. Positions
        // are searched again because earlier replacements shift the text.
        $offset = key($begin2OpenQuoteMap);
        $count = 0;
        foreach ($begin2OpenQuoteMap as $openQuote) {
            $openQuoteBegin = mb_strpos($cookedText, $openQuote, $offset);
            $openQuoteEnd = $openQuoteBegin + mb_strlen($openQuote);
            $closeQuote = $this->quotePairs[$openQuote];
            $closeQuoteBegin = mb_strpos($cookedText, $closeQuote, $openQuoteEnd);
            $closeQuoteEnd = $closeQuoteBegin + mb_strlen($closeQuote);
            $key = "<q". $count++ .">";
            $this->_replacements[$key]
                = mb_substr(
                    $cookedText,
                    $openQuoteBegin,
                    $closeQuoteEnd - $openQuoteBegin
                );
            $cookedText
                = mb_substr($cookedText, 0, $openQuoteBegin)
                .$key
                .mb_substr($cookedText, $closeQuoteEnd);
            $offset = $openQuoteBegin + strlen($key);
        }

        for ($i = 0; $i < count($this->titles); $i++) {
            $cookedText = str_replace($this->titles[$i], "<t$i>", $cookedText);
            $this->_replacements["<t$i>"] = $this->titles[$i];
        }
        for ($i = 0; $i < count($this->ellipsis); $i++) {
            $cookedText = str_replace($this->ellipsis[$i], "<e$i>", $cookedText);
            $this->_replacements["<e$i>"] = $this->ellipsis[$i];
        }
        return $cookedText;
    }

    /**
     * Post-process
     *
     * Restores the first occurrence of every placeholder recorded by the
     * latest preprocess() call.
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    public function postprocess($cookedText)
    {
        foreach ($this->_replacements as $key => $value) {
            $pos = strpos($cookedText, $key);
            if (false !== $pos) {
                $cookedText
                    = substr_replace($cookedText, $value, $pos, strlen($key));
            }
        }
        return $cookedText;
    }

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    public function getSentences($rawHtml)
    {
        $sentences = array();
        $rawHtmlWithoutCrLf = $this->preprocess($rawHtml);
        $textLength = strlen($rawHtmlWithoutCrLf);
        // All positions below are byte offsets, as reported by preg_match().
        $begin = 0;
        $offset = 0;
        $matches = array();
        $isMatched = 1;
        while ($isMatched) {
            $isMatched
                = preg_match(
                    $this->fullStop,
                    $rawHtmlWithoutCrLf,
                    $matches,
                    PREG_OFFSET_CAPTURE,
                    $offset
                );
            if ($isMatched) {
                $matchedFullStop = $matches[0][0];
                $matchedPos = $matches[0][1];
                $next = $matchedPos + strlen($matchedFullStop);
                $isValid = false;
                if ("." != $matchedFullStop) {
                    // "?", "!" and repeated marks always end a sentence.
                    $isValid = true;
                } else if ($next == $textLength) {
                    // A single dot that ends the text. (The comparison used
                    // the start of the match before and could never hold;
                    // the resulting sentences are the same either way.)
                    $isValid = true;
                } else if ($matchedPos + 2 < $textLength
                    && " " == $rawHtmlWithoutCrLf[$matchedPos + 1]
                ) {
                    // A single dot followed by a space and more text.
                    $isValid = true;
                }
                if ($isValid) {
                    $extractedSentence
                        = substr(
                            $rawHtmlWithoutCrLf,
                            $begin,
                            $next - $begin
                        );
                    if ($this->_preserveSpaces) {
                        //Preserve white spaces at the beginning of the sentence
                        $extractedSentence = rtrim($extractedSentence);
                    } else {
                        $extractedSentence = trim($extractedSentence);
                    }
                    $sentences[] = $extractedSentence;
                    $begin = $next;
                }
                $offset = $next;
            }
        }
        // Whatever follows the last accepted full stop is the final sentence.
        if ($begin < $textLength) {
            if ($this->_preserveSpaces) {
                $sentences[] = rtrim(substr($rawHtmlWithoutCrLf, $begin));
            } else {
                $sentences[] = trim(substr($rawHtmlWithoutCrLf, $begin));
            }
        }
        foreach ($sentences as &$sentence) {
            $sentence = $this->postprocess($sentence);
        }
        unset($sentence);
        return $sentences;
    }
}
/**
 * Base class of the sentence segmenters: holds the full-stop pattern and the
 * table of opening/closing quotation marks, and can register HTML elements
 * as additional quote pairs.
 */
abstract class AbstractSentenceSegmenter
{
    // Value of the constructor argument $preserveSpaces.
    protected $_preserveSpaces;
    // Pattern matching one or more sentence-ending marks.
    protected $fullStop = '/[.?!]+/u';
    // Opening quotation marks, in lookup order.
    protected $quoteKeys = array("\"", "'", "“", "‘", "(", "[", "{");
    // Opening quotation mark => matching closing mark.
    protected $quotePairs
        = array(
            "\"" => "\"", "'" => "'",
            "“" => "”", "‘" => "’",
            "(" => ")", "[" => "]", "{" => "}"
        );

    /**
     * Constructor
     *
     * @param bool $preserveSpaces Stored as-is for use by subclasses
     */
    public function __construct($preserveSpaces=false)
    {
        $this->_preserveSpaces = $preserveSpaces;
    }

    /**
     * Add HTML tag as quotation marks
     *
     * Every distinct opening tag of $tagName that is followed by tag-free
     * text and the matching closing tag is registered as an opening quote,
     * with "</$tagName>" as its closing quote.
     *
     * @param string $tagName     Name of HTML tag
     * @param string $cookedText  The text to search
     * @param array  &$quoteKeys  The reference of quoteKeys array
     * @param array  &$quotePairs The reference of quotePairs array
     *
     * @return void
     */
    protected function addHtmlTagAsQuote(
        $tagName, $cookedText, array &$quoteKeys, array &$quotePairs
    ) {
        $pattern = "|(<". $tagName ."[^>]*>)[^<]*</". $tagName .">|ius";
        $found = array();
        $hitCount = preg_match_all($pattern, $cookedText, $found);
        // Nothing to register on a regex failure or when no element matched.
        if (false === $hitCount || $hitCount <= 0) {
            return;
        }
        foreach ($found[1] as $openingTag) {
            if (in_array($openingTag, $quoteKeys)) {
                continue;
            }
            $quoteKeys[] = $openingTag;
            $quotePairs[$openingTag] = '</'. $tagName .'>';
        }
    }

    /**
     * Pre-process
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    abstract public function preprocess($rawHtml);

    /**
     * Post-process
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    abstract public function postprocess($cookedText);

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    abstract public function getSentences($rawHtml);
}