Skip to content
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 14, 2022 · 10 revisions

Welcome to the metadata wiki!

Code snippets for concurrent download of WARC records

"""Usages of the three implementations.

ThreadPoolExecutor:
    cc_html_infos, errs = get_htmls(cc_info_dicts, 120)

Parallel:
    cc_html_infos, errs = get_htmls_in_parallel(cc_info_dicts, 120)

Async I/O:
    cc_html_infos, errs = asyncio.get_event_loop().run_until_complete(get_htmls_in_uvloop(cc_info_dicts, 120))
"""
import concurrent.futures
import gzip
import io
from itertools import islice
import json

from aiohttp_retry import FibonacciRetry, RetryClient
import asyncio
from charset_normalizer import from_bytes
from joblib import Parallel, delayed
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import threading
from tqdm.auto import tqdm


def get_chunk_iter(iterable, size):
    """Yield successive tuples of at most `size` items taken from `iterable`.

    Args:
        iterable: Any iterable; it is consumed lazily, one chunk at a time.
        size: Maximum number of items per chunk (passed straight to `islice`).

    Returns:
        An iterator of tuples. The last tuple may be shorter than `size`;
        iteration stops at the first empty chunk.
    """
    # Obtain the iterator up front so a non-iterable argument fails at call time.
    source = iter(iterable)

    def _chunks():
        while True:
            chunk = tuple(islice(source, size))
            if chunk == ():
                return
            yield chunk

    return _chunks()


# HTTP status codes handed to both retry policies in this file
# (`statuses=` for aiohttp_retry, `status_forcelist=` for urllib3's Retry).
_RETRY_STATUSES = (500, 502, 503, 504)
# Retry budget handed to both retry policies
# (`attempts=` for aiohttp_retry, `total=` for urllib3's Retry).
_RETRY_MAX = 4


async def get_html_in_aio(cc_info, aio_session):
    """Download one WARC record over aiohttp and return its third section.

    The record is requested with an HTTP ``Range`` header built from the
    record offset and length, gunzipped, decoded, and split on blank lines
    (``\\r\\n\\r\\n``) into at most three sections; the third one is returned.

    Args:
        cc_info: Mapping that provides ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.
        aio_session: Object whose ``get(url, headers=...)`` is usable as an
            async context manager yielding a response with ``status``,
            ``reason`` and an awaitable ``read()``.

    Returns:
        The stripped third section, or ``None`` when the decoded record is
        empty or has fewer than three sections.

    Raises:
        RuntimeError: Wraps any exception raised while requesting or
            decoding the record; the message carries ``cc_info['url']``.
    """
    warc_url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    first_byte = int(cc_info["warc_record_offset"])
    record_size = int(cc_info["warc_record_length"])
    last_byte = first_byte + record_size - 1
    encoding = cc_info["content_charset"]
    if pd.isna(encoding):
        # No declared charset: fall back to UTF-8.
        encoding = "utf-8"
    range_header = {"Range": f"bytes={first_byte}-{last_byte}"}

    try:
        async with aio_session.get(warc_url, headers=range_header) as resp:
            # Only a partial-content answer means the byte range was honoured.
            if resp.status != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{resp.status}: {resp.reason}")
            payload = await resp.read()
            with io.BytesIO(payload) as compressed, gzip.GzipFile(fileobj=compressed) as record:
                raw = record.read()

            try:
                if encoding in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                    # Declared UTF-16 variants are not trusted; detect instead.
                    text = str(from_bytes(raw).best()).strip()
                else:
                    text = raw.decode(encoding, "backslashreplace").strip()
            except LookupError as lookup_err:
                # Unknown codec name: let charset-normalizer guess.
                print(f"{lookup_err!r}: {cc_info['url']}; switching to charset-normalizer")
                text = str(from_bytes(raw).best()).strip()

            if not text:
                return None
            sections = text.split("\r\n\r\n", 2)
            if len(sections) != 3:
                return None
            return sections[2].strip()
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err


async def get_htmls_in_uvloop(cc_infos, chunk_size=256):
    """Download the HTML of many WARC records concurrently with asyncio.

    Records are processed chunk by chunk; all downloads of one chunk run
    concurrently through a single retrying aiohttp client.

    Args:
        cc_infos: Iterable of mutable mappings accepted by
            `get_html_in_aio`. Each mapping is modified in place: it gains
            an ``"html"`` key on success or an ``"err"`` key on failure.
        chunk_size: Number of records downloaded concurrently.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the mappings that received HTML,
        and the mappings whose download failed or produced no HTML.
    """
    cc_info_htmls, errs = [], []

    retry_options = FibonacciRetry(attempts=_RETRY_MAX, statuses=_RETRY_STATUSES)
    async with RetryClient(raise_for_status=False, retry_options=retry_options) as rc:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            # return_exceptions=True keeps one failed download from aborting
            # the whole run; failures come back as result values instead and
            # are recorded per record below, like in `get_htmls`.
            outcomes = await asyncio.gather(
                *(get_html_in_aio(cc_info, rc) for cc_info in cc_infos_chnk),
                return_exceptions=True,
            )
            for outcome, cc_info in zip(outcomes, cc_infos_chnk):
                try:
                    if isinstance(outcome, BaseException):
                        # Non-`Exception` errors (e.g. cancellation) are not
                        # caught below and still propagate to the caller.
                        raise outcome
                    if not outcome:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = outcome
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs


# Per-thread storage; `_get_session` caches one `requests.Session` per thread on it.
_THREAD_LOCAL = threading.local()


def _get_session():
    """Return the calling thread's `requests.Session`, creating it on first use.

    A newly created session gets an HTTPS adapter whose retry policy uses
    `_RETRY_MAX` and `_RETRY_STATUSES` with a back-off factor of 1, and is
    stored on `_THREAD_LOCAL` for later calls from the same thread.
    """
    try:
        return _THREAD_LOCAL.session
    except AttributeError:
        # First call on this thread: fall through and build the session.
        pass

    session = requests.Session()
    retry_policy = Retry(
        total=_RETRY_MAX, backoff_factor=1, status_forcelist=_RETRY_STATUSES
    )
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    _THREAD_LOCAL.session = session
    return _THREAD_LOCAL.session


def get_html(cc_info):
    """Download one WARC record with `requests` and return its third section.

    The record is requested with an HTTP ``Range`` header built from the
    record offset and length, gunzipped, decoded, and split on blank lines
    (``\\r\\n\\r\\n``) into at most three sections; the third one is returned.

    Args:
        cc_info: Mapping that provides ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.

    Returns:
        The stripped third section, or ``None`` when the decoded record is
        empty or has fewer than three sections.

    Raises:
        RuntimeError: Wraps any exception raised while requesting or
            decoding the record; the message carries ``cc_info['url']``.
    """
    warc_url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    first_byte = int(cc_info["warc_record_offset"])
    record_size = int(cc_info["warc_record_length"])
    last_byte = first_byte + record_size - 1
    encoding = cc_info["content_charset"]
    if pd.isna(encoding):
        # No declared charset: fall back to UTF-8.
        encoding = "utf-8"
    range_header = {"Range": f"bytes={first_byte}-{last_byte}"}

    session = _get_session()
    try:
        with session.get(warc_url, headers=range_header) as resp:
            # Only a partial-content answer means the byte range was honoured.
            if resp.status_code != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{resp.status_code}: {resp.reason}")
            with io.BytesIO(resp.content) as compressed, gzip.GzipFile(fileobj=compressed) as record:
                raw = record.read()

            try:
                if encoding in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                    # Declared UTF-16 variants are not trusted; detect instead.
                    text = str(from_bytes(raw).best()).strip()
                else:
                    text = raw.decode(encoding, "backslashreplace").strip()
            except LookupError as lookup_err:
                # Unknown codec name: let charset-normalizer guess.
                print(f"{lookup_err!r}: {cc_info['url']}; switching to charset-normalizer")
                text = str(from_bytes(raw).best()).strip()

            if not text:
                return None
            sections = text.split("\r\n\r\n", 2)
            if len(sections) != 3:
                return None
            return sections[2].strip()
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err


def get_htmls_in_parallel(cc_infos, chunk_size=256):
    """Download the HTML of many WARC records concurrently with joblib.

    Records are processed chunk by chunk on a joblib pool that prefers
    threads and is sized to `chunk_size`.

    Args:
        cc_infos: Iterable of mutable mappings accepted by `get_html`. Each
            mapping is modified in place: it gains an ``"html"`` key on
            success or an ``"err"`` key on failure.
        chunk_size: Number of records per chunk and number of joblib jobs.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the mappings that received HTML,
        and the mappings whose download failed or produced no HTML.
    """

    def _fetch_or_error(cc_info):
        # joblib re-raises a worker exception from `parallel(...)`, which
        # would abort the whole run. Hand the exception back as a value so
        # it can be recorded per record, like in `get_htmls`.
        try:
            return get_html(cc_info)
        except Exception as err:
            return err

    cc_info_htmls, errs = [], []
    with Parallel(n_jobs=chunk_size, prefer="threads") as parallel:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            outcomes = parallel(
                delayed(_fetch_or_error)(cc_info) for cc_info in cc_infos_chnk
            )
            for outcome, cc_info in zip(outcomes, cc_infos_chnk):
                try:
                    if isinstance(outcome, Exception):
                        raise outcome
                    if not outcome:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = outcome
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs


def get_htmls(cc_infos, chunk_size=256):
    """Download the HTML of many WARC records concurrently with a thread pool.

    Records are submitted chunk by chunk to one `ThreadPoolExecutor`; every
    future of a chunk is collected before the next chunk is submitted.

    Args:
        cc_infos: Iterable of mutable mappings accepted by `get_html`. Each
            mapping is modified in place: it gains an ``"html"`` key on
            success or an ``"err"`` key on failure.
        chunk_size: Number of records per chunk and maximum number of
            worker threads.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the mappings that received HTML,
        and the mappings whose download failed or produced no HTML.
    """
    cc_info_htmls, errs = [], []
    chunks = tuple(get_chunk_iter(cc_infos, chunk_size))
    # One pool for the whole run: `get_html` caches its `requests.Session`
    # per thread, so keeping the worker threads alive across chunks lets the
    # sessions be reused instead of being rebuilt for every chunk.
    # `chunk_size or None`: a chunk size of 0 produces no chunks at all, so
    # use the executor's default size rather than an invalid `max_workers=0`.
    with concurrent.futures.ThreadPoolExecutor(max_workers=chunk_size or None) as exctr:
        for cc_infos_chnk in tqdm(chunks):
            html_ftr_to_cc_info = {
                exctr.submit(get_html, cc_info): cc_info for cc_info in cc_infos_chnk
            }
            for ftr in concurrent.futures.as_completed(html_ftr_to_cc_info):
                cc_info = html_ftr_to_cc_info[ftr]
                try:
                    # `result()` re-raises whatever `get_html` raised.
                    html = ftr.result()
                    if not html:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = html
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs

Some old PHP code for HTML text extraction

HtmlContentExtractor
/**
 * Extracts text blocks, selected attribute values and image sources from a
 * DOMDocument, keyed by node path.
 */
class HtmlContentExtractor
{
    // Document being inspected.
    private $dom;
    // DOMXPath bound to $dom; used by every query in this class.
    protected $xpath;

    /**
     * Constructor
     *
     * Forces the document encoding to UTF-8, disables formatted output and
     * creates the XPath helper. 'Yaraku\Html\ErrorHandlerFunction' is
     * installed as error handler for the duration of the set-up and restored
     * afterwards, on both the success and the failure path.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);

        try {
            $this->dom = $dom;
            if (!$this->dom) {
                throw new \Exception("DOMDocument is invalid.");
            }
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }

        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * The map is filled from five sources, in this order:
     *  1. text nodes whose parent is not script/style/code and does not carry
     *     translate="no"; each is stored either under its own node path or,
     *     when it has a matching ancestor element, as that ancestor's inner
     *     HTML under the key "ancestorPath|openingTag|closingTag";
     *  2. the content of meta description/keywords elements in the head;
     *  3. the value of button/reset/search/submit/text inputs;
     *  4. the placeholder of inputs, under "inputPath/@placeholder";
     *  5. every non-empty alt attribute.
     * Values consisting only of white space and digits are skipped for
     * sources 1 to 4.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = array();

        $blocks = array();
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // NOTE(review): helper is defined outside this class; presumably it
        // orders the nodes by depth -- confirm against its definition.
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $e */
        foreach ($elementArray as $e) {
            //$temp = self::whiteSpaceNormalization($e->C14N());
            $temp = self::whiteSpaceNormalization($this->dom->saveHTML($e));
            // Drop tags, then white space and digits; whatever is left
            // decides whether the node carries any text worth extracting.
            $temp = preg_replace("/<[^>]+>/u", "", $temp);
            $temp = preg_replace("/[\s\d]+/u", "", $temp);
            if ($temp !== '') {
                $nodePath = $e->getNodePath();
                // Union of the ancestor elements that may act as the
                // enclosing block of this text node.
                $ancestorQuery
                    = "$nodePath/ancestor::p[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::a[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::div[not(descendant::table)"
                     ." and not(descendant::div)"
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::font[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::span[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::li[not(descendant::table)"
                     ." and not(descendant::li)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::dt[not(descendant::table)]"
                     ." | $nodePath/ancestor::dd[not(descendant::table)]"
                     ." | $nodePath/ancestor::td[not(descendant::table)"
                     ." and not(descendant::div) "
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::th[not(descendant::table)]"
                     ." | $nodePath/ancestor::b[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::i[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::u[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::tt[not(descendant::table)]"
                     ." | $nodePath/ancestor::blockquote[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strike[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::em[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strong[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::iframe[not(descendant::table)"
                     ." and not(descendant::div) and normalize-space(text())]"
                ;
                $ancestors = $this->xpath->query($ancestorQuery);
                if (!$ancestors) {
                    throw new \Exception(
                        "$ancestorQuery is an incorrect XPath query."
                    );
                } elseif (0 === $ancestors->length) {
                    // No enclosing block: keep the text node on its own.
                    //$blocks[$nodePath] = $e->C14N();
                    $blocks[$nodePath] = $this->dom->saveHTML($e);
                } else {
                    // Skip this text node when any of its candidate
                    // ancestors has already been stored as a block.
                    $isExtractedBlock = false;
                    for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                        $blockNode = $ancestors->item($i);
                        //$blockOuterC14N = $blockNode->C14N();
                        $blockOuterC14N = $this->dom->saveHTML($blockNode);
                        $blockNodePath = $blockNode->getNodePath();
                        // Split the serialized element into its opening tag
                        // (index 0) and everything after it (index 1).
                        $headAndTheRest = preg_split(
                            "/^(<$blockNode->nodeName[^>]*>)/iu",
                            $blockOuterC14N,
                            2,
                            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                        );
                        $head = $headAndTheRest[0];
                        $tail = "</$blockNode->nodeName>";

                        $key = "$blockNodePath|$head|$tail";
                        if (array_key_exists($key, $blocks)) {
                            $isExtractedBlock = true;
                            break;
                        }
                    }
                    if ($isExtractedBlock) {
                        continue;
                    }

                    // Store the last ancestor of the result list.
                    // NOTE(review): presumably the innermost one, assuming
                    // the node list is in document order -- confirm.
                    $blockNode = $ancestors->item($ancestors->length - 1);
                    //$blockOuterC14N = $blockNode->C14N();
                    $blockOuterC14N = $this->dom->saveHTML($blockNode);
                    $blockNodePath = $blockNode->getNodePath();
                    $headAndTheRest = preg_split(
                        "/^(<$blockNode->nodeName[^>]*>)/iu",
                        $blockOuterC14N,
                        2,
                        PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                    );
                    $head = $headAndTheRest[0];
                    $tail = "</$blockNode->nodeName>";
                    // Cut at the last closing tag to get the inner HTML.
                    $end = strripos($headAndTheRest[1], $tail);
                    $blockInnerC14N = substr($headAndTheRest[1], 0, $end);
                    $blocks["$blockNodePath|$head|$tail"] = $blockInnerC14N;
                }
            }
        }
        $blocks = array_reverse($blocks);
        foreach ($blocks as $path => $html) {
            // Remove serialized carriage returns from the block HTML.
            $textArray[$path] = str_replace('&#xD;', '', $html);
        }

        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            if (1 != preg_match("/^[\s\d]+$/u", $meta->getAttribute("content"))) {
                $textArray[$meta->getNodePath()]
                    = $meta->getAttribute("content");
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("value"))) {
                $textArray[$input->getNodePath()]
                    = $input->getAttribute("value");
            }
        }
        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        foreach ($inputsWithPlaceholder as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("placeholder"))
            ) {
                // The '/@placeholder' suffix keeps this entry apart from the
                // input's value entry, which is keyed by the bare node path.
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $input->getAttribute("placeholder");
            }
        }
        //        $attributes =
        //            $this->m_xpath->query(
        //                "//*["
        //                    ."string(@abbr) or string(@alt) or string(@label)"
        //                    ." or string(@title) or string(@standby)"
        //                    ." or string(@summary)"
        //                ."]");
        //        foreach ($attributes as $a)
        //            $textArray[$a->getNodePath()] = $a->C14N();
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get text array
     *
     * Same content as getNodePathAndTextMap(), without the keys.
     *
     * @return array
     * @throws \Exception
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts together with the line number of their node
     *
     * The node path is the part of each map key before the first '|'; the
     * line number is read from the first node that path resolves to.
     *
     * @return array list of [lineNumber, text] pairs
     * @throws \Exception
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        $nodePathAndTextMap = $this->getNodePathAndTextMap();
        foreach ($nodePathAndTextMap as $nodePathWithHeadTail => $text) {
            $nodePath = array_values(explode('|', $nodePathWithHeadTail))[0];
            /** @var \DOMNodeList $nodeList */
            $nodeList = $this->xpath->query($nodePath);
            $lineNumber = $nodeList->item(0)->getLineNo();
            $textWithLineNumberList[] = [$lineNumber, $text];
        }

        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * Maps the node path of every img element with a non-empty src
     * attribute to that src value.
     *
     * @return array
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = array();

        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMNode $i */
        foreach ($images as $i) {
            $imageArray[$i->getNodePath()]
                = $i->attributes->getNamedItem('src')->nodeValue;
        }

        return $imageArray;
    }

    /**
     * Get image array
     *
     * Same content as getNodePathAndImageMap(), without the keys.
     *
     * @return array
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * Removes white space between tags, leading/trailing white space of
     * every line and all line breaks; optionally escapes double quotes.
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);

        if ($jsonFriendly) {
            $html = str_replace('"', '\"', $html);
        }

        return $html;
    }

    /**
     * Encode the string into HTML Encoding format
     *
     * Every character of the UTF-8 input becomes a decimal numeric
     * character reference ("&#N;").
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        // UTF-32 gives one fixed-width 32-bit unit per code point, which
        // unpack("N*") then reads as integers.
        $str = mb_convert_encoding($str, 'UTF-32', 'UTF-8');

        $t = unpack("N*", $str);

        $t = array_map(
            function ($n) {
                return "&#$n;";
            }, $t
        );

        return implode("", $t);
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of the listed white space and invisible characters, and of the
     * listed named/hexadecimal entities, are collapsed into a single space;
     * the result is trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);

        // replace uncommon white space with ordinary white space
        // (every alternative has to be separated by '|': without the one
        // after &#8233; it fuses with &#32; into a single two-character
        // alternative and neither matches on its own)
        $text = preg_replace(
            '/(\&\#5760\;|\&\#6158\;|'
            .'\&\#8192\;|\&\#8193\;|'
            .'\&\#8194\;|\&\#8195\;|\&\#8196\;|\&\#8197\;|\&\#8198\;|\&\#8199\;|'
            .'\&\#8200\;|\&\#8201\;|\&\#8204\;|\&\#8205\;|\&\#8206\;|\&\#8207\;|'
            .'\&\#8202\;|\&\#8239\;|\&\#8287\;|\&\#12288\;|\&\#10\;|'
            .'\&\#11\;|\&\#12\;|\&\#13\;|\&\#133\;|\&\#8232\;|\&\#8233\;|'
            .'\&\#32\;|\&\#09\;|\&\#11\;|'
            .'\&\#160\;|\&\#9\;)+/u',
            " ",
            $text
        );

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = trim(
            preg_replace(
                '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
                .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
                .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
                .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
                .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&#xD\;|\x{FEFF})+/u',
                " ",
                $text
            )
        );

        return $text;
    }
}
HtmlContentExtractorTest
/**
 * Tests for HtmlContentExtractor, run against HTML fixtures stored in the
 * "_data" folder next to this file. Most tests build the document twice,
 * through PhpDom::make() and Html5Dom::make(), and expect the same result.
 */
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    // Absolute path of the fixture folder, with a trailing separator.
    protected $dataFolderPath;
    // Path of the www.o-inn.co.jp fixture page.
    protected $oInnPage;
    // Path of the www.solarehotels.com fixture page.
    protected $solarePage;

    /** @var \DOMDocument $phpDom */
    protected $phpDom;

    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;

    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the fixture folder and the two fixture pages used by several
     * tests.
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();
        $this->dataFolderPath
            = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->oInnPage = $this->dataFolderPath .'www.o-inn.co.jp_index.html';
        $this->solarePage = $this->dataFolderPath .'www.solarehotels.com.html';
    }

    /**
     * Test that the first extracted text comes with its source line number
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);

        // The Html5Dom variant below is deliberately not executed: the test
        // is marked as skipped at this point.
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');
        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get node path and text map
     *
     * Checks the meta keywords entry of both fixture pages and, for the
     * o-inn page, a paragraph whose map key has the "path|head|tail" form.
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        $expectedMetaKeywordsNodePath = "/html/head/meta[5]";
        $expectedMetaKeywordsText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
                ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
        }

        $expectedMetaKeywordsNodePath = "/html/head/meta[3]";
        $expectedMetaKeywordsText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
                ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $expectedInterpolatedCaseNodePath
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        $expectedInterpolatedCaseText
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                    .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                    .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                . PHP_EOL // Masterminds\HTML5 does not seem to use the source's EOL.
                .'            ホテルお茶の水イン'
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
            $this->assertArrayHasKey(
                $expectedInterpolatedCaseNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedInterpolatedCaseText,
                $nodePathAndTextMap[$expectedInterpolatedCaseNodePath]
            );
        }
    }

    /**
     * Test get text array
     *
     * Checks that known blocks of both fixture pages appear in the
     * extracted text list.
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        $expectedCommonCase
            = '  * 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &amp;'
            .' RESORTS official website.<br>'."\n"
            .'  * 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
        }

        $expectedCommonCase
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
                .'東京都文京区湯島1-3-7<br>TEL:03-3813-8211<br>'
                .'FAX:03-3813-9730<br>'
                .'<a href="/transportation/">お茶の水インまでの地図</a>'
            ;
        $expectedInterpolatedCase
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                ."\n            ホテルお茶の水イン"
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
            $this->assertContains($expectedInterpolatedCase, $textArray);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * First asserts that the raw fixture really contains the ill-formed
     * fragment (a surplus closing </a>), then that the extracted text has
     * the fragment without that surplus tag.
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        $unpreparedHtml = file_get_contents($this->oInnPage);
        $illformedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n            ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        $this->assertNotEquals(false, strpos($unpreparedHtml, $illformedCase));

        $expectedCase
            = '掲載されている'
                . '<a href="http://www.tripadvisor.jp/'
                . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
                . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
                . "\n            ホテルお茶の水イン</a>"
                . 'のクチコミはTripAdvisorより提供を受けています';
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);
        }
    }

    /**
     * Testing for placeholder attribute extraction from input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $expectedPath = '/html/body/input/@placeholder';
        $expectedText = 'お名前';
        $file = $this->dataFolderPath .'inputPlaceholderTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Testing for alt attribute extraction from image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $expectedPath = '/html/body/img/@alt';
        $expectedText = '画像です';
        $file = $this->dataFolderPath .'imageAltTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * The listed text node must not show up as a key of the map.
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $unexpectedPath = '/html/body/div[1]/text()[3]';
        $file = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($unexpectedPath, $nodePathAndTextMap);
        }
    }

//    /**
//     * Test sorting the node-path-to-text map by the line numbers of the HTML
//     *
//     * @return void
//     */
//    public function testSortNodePathAndTextMapByLineNum()
//    {
//        $file = $this->dataFolderPath .'replaceNodeXpath.html';
//        $doms = [PhpDom::make($file), Html5Dom::make($file)];
//        foreach ($doms as $dom) {
//            $extractor = new HtmlContentExtractor($dom);
//            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
//            $this->assertEquals(
//                'HTML Test', array_values($nodePathAndTextMap)[0]
//            );
//            $this->assertEquals(
//                '選べるリージョンとゾーン',
//                array_values($nodePathAndTextMap)[4]
//            );
//        }
//    }

}
PartialHtmlWrapper
/**
 * Wraps an HTML fragment in a fixed document header and footer, and strips
 * those wrappers again.
 */
class PartialHtmlWrapper
{
    /**
     * Document prologue placed in front of the fragment by wrap().
     */
    const HEADER = <<<HTML_HEADER
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"></head><body>
HTML_HEADER;

    /**
     * Alternative prologue that unwrap() also strips.
     * NOTE(review): presumably the form the header takes after
     * canonicalization (C14N) - confirm against the producer.
     */
    const HEADER_C14N = <<<HTML_HEADER_C14N
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type">'</meta></head><body>
HTML_HEADER_C14N;

    /**
     * Document epilogue appended by wrap(); it ends with a newline.
     */
    const FOOTER = <<<HTML_FOOTER
</body></html>

HTML_FOOTER;

    /**
     * Surround a partial HTML fragment with HEADER and FOOTER
     *
     * @param string $partialHtml Partial HTML
     *
     * @return string
     */
    public static function wrap($partialHtml)
    {
        $wrapped = self::HEADER;
        $wrapped .= $partialHtml;
        $wrapped .= self::FOOTER;

        return $wrapped;
    }

    /**
     * Remove every occurrence of the known headers and the footer
     *
     * @param string $wrappedPartialHtml Wrapped partial HTML
     *
     * @return string
     */
    public static function unwrap($wrappedPartialHtml)
    {
        // Order matters for str_replace(): each needle is removed in turn.
        $wrappers = [self::HEADER_C14N, self::HEADER, self::FOOTER];

        return str_replace($wrappers, '', $wrappedPartialHtml);
    }
}
English HTML sentence segmenter
/**
 * Sentence segmenter for English HTML text.
 *
 * preprocess() hides text that must not be split (quoted spans, selected
 * HTML elements, titles such as "Mr.", ellipses) behind placeholders,
 * getSentences() splits on full stops, and postprocess() puts the hidden
 * text back.
 */
class EnglishSentenceSegmenter extends AbstractSentenceSegmenter
{
    /**
     * Placeholder => original text, rebuilt on every preprocess() call
     *
     * @var array
     */
    private $_replacements = array();

    /**
     * Abbreviations whose trailing period does not end a sentence
     *
     * @var array
     */
    protected $titles
        = array(
            "Mr.", "Ms.", "Mrs.", "Dr.", "Prof.",
            "M.", "Ph.D.", "D.Phil.", "M.D.", "D.O.",
            "Capt.", "Cpl.", "Sgt.", "Maj.", "Gen.", "Messrs."
        );

    /**
     * Ellipsis spellings that must not be treated as full stops
     *
     * @var array
     */
    protected $ellipsis = array("...", ". . .");

    /**
     * HTML tags whose elements are treated like quoted spans
     *
     * @var array
     */
    private $_quoteLikeTags
        = array(
            "span", "font", "a", "li",
            "h1", "h2", "h3", "h4", "h5", "h6", "p"
        );

    /**
     * Constructor
     *
     * @param bool $preserveSpaces Keep leading/inner white space when true
     */
    public function __construct($preserveSpaces=false)
    {
        parent::__construct($preserveSpaces);
        mb_internal_encoding("UTF-8");
    }

    /**
     * Pre-process
     *
     * Normalizes white space (unless it is preserved) and replaces quoted
     * spans, titles and ellipses with placeholders recorded in
     * $_replacements.
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    public function preprocess($rawHtml)
    {
        $cookedText = $rawHtml;

        if (!$this->_preserveSpaces) {
            $cookedText = preg_replace("/[\r\n\s]+/u", " ", $cookedText);
            $cookedText = trim($cookedText);
        } else {
            //Preserve white spaces at the beginning of the sentence
            $cookedText = rtrim($cookedText);
        }

        $this->_replacements = array();

        foreach ($this->_quoteLikeTags as $tagName) {
            $this->addHtmlTagAsQuote(
                $tagName, $cookedText, $this->quoteKeys, $this->quotePairs
            );
        }

        // Pass 1: for every opening quote, record where it starts and where
        // its closing quote starts (character offsets).
        $begin2OpenQuoteAndEndMap = array();
        foreach ($this->quoteKeys as $openQuote) {
            if ('' === $openQuote) {
                // An empty needle never advances the scan (PHP 8 reports a
                // match at every offset), so it would loop forever.
                continue;
            }
            $isPlainQuote = ("\"" === $openQuote || "'" === $openQuote);
            $closeQuote = $this->quotePairs[$openQuote];
            $offset = 0;
            $begin = mb_strpos($cookedText, $openQuote, $offset);
            while (false !== $begin) {
                // A straight quote only opens a quotation at the start of
                // the text or after a space (this skips apostrophes).
                // $begin is a character offset, so read the preceding
                // character with mb_substr(), not byte indexing.
                if ($isPlainQuote
                    && $begin > 0
                    && " " !== mb_substr($cookedText, $begin - 1, 1)
                ) {
                    // Resume right after the rejected quote.
                    $offset = $begin + 1;
                } else {
                    $offset = $begin + mb_strlen($openQuote);
                    $end = mb_strpos($cookedText, $closeQuote, $offset);
                    if (false !== $end) {
                        $offset = $end + mb_strlen($closeQuote);
                        $begin2OpenQuoteAndEndMap[$begin]
                            = array($openQuote, $end);
                    }
                }
                $begin = mb_strpos($cookedText, $openQuote, $offset);
            }
        }

        // Pass 2: walk the spans in text order and drop any span that
        // starts or ends before the end of an already accepted one.
        ksort($begin2OpenQuoteAndEndMap);
        $previousEnds = array();
        $begin2OpenQuoteMap = array();
        foreach ($begin2OpenQuoteAndEndMap as $begin => $openQuoteAndEnd) {
            $openQuote = $openQuoteAndEnd[0];
            $end = $openQuoteAndEnd[1];
            $isOverlapped = false;
            foreach ($previousEnds as $previousEnd) {
                if ($end < $previousEnd || $begin < $previousEnd) {
                    $isOverlapped = true;
                    break;
                }
            }

            if (!$isOverlapped) {
                $previousEnds[] = $end;
                $begin2OpenQuoteMap[$begin] = $openQuote;
            }
        }

        // Pass 3: swap each accepted span for a "<qN>" placeholder. Offsets
        // recorded above shift after every replacement, so each span is
        // located again starting right after the previous placeholder.
        $offset = key($begin2OpenQuoteMap);
        $count = 0;
        foreach ($begin2OpenQuoteMap as $openQuote) {
            $openQuoteBegin = mb_strpos($cookedText, $openQuote, $offset);
            $openQuoteEnd = $openQuoteBegin + mb_strlen($openQuote);
            $closeQuote = $this->quotePairs[$openQuote];
            $closeQuoteBegin
                = mb_strpos($cookedText, $closeQuote, $openQuoteEnd);
            $closeQuoteEnd = $closeQuoteBegin + mb_strlen($closeQuote);
            $key = "<q". $count++ .">";
            $this->_replacements[$key]
                = mb_substr(
                    $cookedText,
                    $openQuoteBegin,
                    $closeQuoteEnd - $openQuoteBegin
                );
            $cookedText
                = mb_substr($cookedText, 0, $openQuoteBegin)
                    .$key
                    .mb_substr($cookedText, $closeQuoteEnd);
            // The placeholder is ASCII, so its byte length equals its
            // character length.
            $offset = $openQuoteBegin + strlen($key);
        }

        // Hide the periods of titles and ellipses from the full-stop scan.
        foreach ($this->titles as $i => $title) {
            $cookedText = str_replace($title, "<t$i>", $cookedText);
            $this->_replacements["<t$i>"] = $title;
        }

        foreach ($this->ellipsis as $i => $dots) {
            $cookedText = str_replace($dots, "<e$i>", $cookedText);
            $this->_replacements["<e$i>"] = $dots;
        }

        return $cookedText;
    }

    /**
     * Post-process
     *
     * Restores the text hidden by preprocess(). Every occurrence of a
     * placeholder is restored: title and ellipsis placeholders can appear
     * several times in one sentence.
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    public function postprocess($cookedText)
    {
        foreach ($this->_replacements as $key => $value) {
            $cookedText = str_replace($key, $value, $cookedText);
        }

        return $cookedText;
    }

    /**
     * Get sentences
     *
     * Splits the pre-processed text after each run of ".", "?" or "!". A
     * single "." only ends a sentence when it is the last character or is
     * followed by a space; any other match always does.
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    public function getSentences($rawHtml)
    {
        $sentences = array();

        $rawHtmlWithoutCrLf = $this->preprocess($rawHtml);
        // preg_match() offsets are byte offsets, so this method works in
        // bytes (strlen/substr) throughout.
        $length = strlen($rawHtmlWithoutCrLf);

        $begin = 0;
        $offset = 0;
        $matches = array();

        while (1 === preg_match(
            $this->fullStop,
            $rawHtmlWithoutCrLf,
            $matches,
            PREG_OFFSET_CAPTURE,
            $offset
        )) {
            $matchedFullStop = $matches[0][0];
            $matchedPos = $matches[0][1];
            $next = $matchedPos + strlen($matchedFullStop);

            $isValid = "." !== $matchedFullStop
                || $next === $length
                || ($matchedPos + 2 < $length
                    && " " === $rawHtmlWithoutCrLf[$matchedPos + 1]);

            if ($isValid) {
                $extractedSentence
                    = substr($rawHtmlWithoutCrLf, $begin, $next - $begin);

                //Preserve white spaces at the beginning of the sentence
                $sentences[] = $this->_preserveSpaces
                    ? rtrim($extractedSentence)
                    : trim($extractedSentence);

                $begin = $next;
            }
            $offset = $next;
        }

        // Whatever follows the last accepted full stop is a sentence too.
        if ($begin < $length) {
            $rest = substr($rawHtmlWithoutCrLf, $begin);
            $sentences[] = $this->_preserveSpaces ? rtrim($rest) : trim($rest);
        }

        foreach ($sentences as &$sentence) {
            $sentence = $this->postprocess($sentence);
        }
        unset($sentence);

        return $sentences;
    }
}

/**
 * Base class for sentence segmenters: holds the full-stop pattern and the
 * opening/closing quote table shared by concrete segmenters.
 */
abstract class AbstractSentenceSegmenter
{

    /**
     * Whether white space is preserved instead of normalized
     *
     * @var bool
     */
    protected $_preserveSpaces;

    /**
     * Pattern matching a run of sentence-ending punctuation
     *
     * @var string
     */
    protected $fullStop = '/[.?!]+/u';

    /**
     * Opening quotation marks, in scan order.
     *
     * The curly quotes had been reduced to empty strings by an encoding
     * mishap; an empty needle is invalid for mb_strpos() before PHP 8 and
     * matches at every offset from PHP 8 on.
     *
     * @var array
     */
    protected $quoteKeys = array("\"", "'", "“", "‘", "(", "[", "{");

    /**
     * Opening quotation mark => matching closing mark
     *
     * @var array
     */
    protected $quotePairs
        = array(
            "\"" => "\"", "'" => "'",
            "“"  => "”", "‘" => "’",
            "("  => ")", "[" => "]", "{" => "}"
        );

    /**
     * Constructor
     *
     * @param bool $preserveSpaces Keep white space instead of normalizing it
     */
    public function __construct($preserveSpaces=false)
    {
        $this->_preserveSpaces=$preserveSpaces;
    }

    /**
     * Add HTML tag as quotation marks
     *
     * Finds every "<tag ...>text</tag>" element whose content has no nested
     * tag and registers its exact opening tag as an opening quote paired
     * with "</tag>".
     *
     * @param string $tagName     Name of HTML tag
     * @param string $cookedText  The text to search
     * @param array  &$quoteKeys  The reference of quoteKeys array
     * @param array  &$quotePairs The reference of quotePairs array
     *
     * @return void
     */
    protected function addHtmlTagAsQuote(
        $tagName, $cookedText, array &$quoteKeys, array &$quotePairs
    ) {
        // Quote the tag name so it cannot alter the pattern.
        $tag = preg_quote($tagName, "|");
        $matches = array();
        $isFound
            = preg_match_all(
                "|(<". $tag ."[^>]*>)[^<]*</". $tag .">|ius",
                $cookedText,
                $matches
            );
        // preg_match_all() yields false on error and 0 when nothing matched.
        if (!$isFound) {
            return;
        }

        foreach ($matches[1] as $match) {
            if (!in_array($match, $quoteKeys, true)) {
                $quoteKeys[] = $match;
                $quotePairs[$match] = '</'. $tagName .'>';
            }
        }
    }

    /**
     * Pre-process
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    abstract public function preprocess($rawHtml);

    /**
     * Post-process
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    abstract public function postprocess($cookedText);

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    abstract public function getSentences($rawHtml);
}
Clone this wiki locally