Home

Welcome to the metadata wiki!

Code snippets for concurrent download of WARC records

"""Usages of the three implementations.

ThreadPoolExecutor:
    cc_html_infos, errs = get_htmls(cc_info_dicts, 120)

Parallel:
    cc_html_infos, errs = get_htmls_in_parallel(cc_info_dicts, 120)

Async I/O:
    cc_html_infos, errs = asyncio.get_event_loop().run_until_complete(get_htmls_in_uvloop(cc_info_dicts, 120))
"""
import concurrent.futures
import gzip
import io
from itertools import islice
import json

from aiohttp_retry import FibonacciRetry, RetryClient
import asyncio
from charset_normalizer import from_bytes
from joblib import Parallel, delayed
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import threading
from tqdm.auto import tqdm


def get_chunk_iter(iterable, size):
    """Return an iterator over consecutive tuples of up to ``size`` items.

    The final tuple may be shorter than ``size``. Iteration stops as soon as
    an empty tuple would be produced.
    """
    source = iter(iterable)

    def _chunks():
        while True:
            chunk = tuple(islice(source, size))
            if chunk == ():
                return
            yield chunk

    return _chunks()


# HTTP status codes on which both the aiohttp and the requests code paths retry.
_RETRY_STATUSES = (500, 502, 503, 504)
# Retry budget: used as ``attempts`` for FibonacciRetry and ``total`` for urllib3's Retry.
_RETRY_MAX = 4


async def get_html_in_aio(cc_info, aio_session):
    """Fetch one WARC record with a ranged GET and return the text it carries.

    Args:
        cc_info: Mapping with the keys ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.
        aio_session: Client whose ``get()`` works as an async context manager.

    Returns:
        The stripped text that follows the second blank line of the decoded
        record (presumably the HTTP body), or ``None`` when the record is
        empty or has fewer than three blank-line separated parts.

    Raises:
        RuntimeError: Wraps any failure while downloading, decompressing or
            decoding; the message carries the record's ``url``.
    """
    warc_url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    first_byte = int(cc_info["warc_record_offset"])
    record_size = int(cc_info["warc_record_length"])
    last_byte = first_byte + record_size - 1
    encoding = cc_info["content_charset"]
    if pd.isna(encoding):
        encoding = "utf-8"

    try:
        range_header = {"Range": f"bytes={first_byte}-{last_byte}"}
        async with aio_session.get(warc_url, headers=range_header) as resp:
            # A ranged request must be answered with 206 Partial Content.
            if resp.status != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{resp.status}: {resp.reason}")
            payload = await resp.read()
            with io.BytesIO(payload) as buf, gzip.GzipFile(fileobj=buf) as gz:
                raw = gz.read()
                try:
                    if encoding in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                        # UTF-16 records are not decoded with the declared
                        # charset; charset-normalizer guesses instead.
                        text = str(from_bytes(raw).best()).strip()
                    else:
                        text = raw.decode(encoding, "backslashreplace").strip()
                except LookupError as le:
                    # Unknown codec name: let charset-normalizer guess.
                    print(f"{repr(le)}: {cc_info['url']}; switching to charset-normalizer")
                    text = str(from_bytes(raw).best()).strip()
                if not text:
                    return None
                sections = text.split("\r\n\r\n", 2)
                if len(sections) != 3:
                    return None
                return sections[2].strip()
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err


async def get_htmls_in_uvloop(cc_infos, chunk_size=256):
    """Download the HTML of many Common Crawl records concurrently with asyncio.

    Records are processed in chunks of ``chunk_size``; all records of a chunk
    are fetched concurrently through one shared retrying client.

    Args:
        cc_infos: Iterable of mutable mappings accepted by ``get_html_in_aio``.
            Each mapping is updated in place with an ``"html"`` key on success
            or an ``"err"`` key on failure.
        chunk_size: Number of records fetched concurrently per chunk.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the records that received HTML and
        the records that failed (download error or empty HTML).
    """
    cc_info_htmls, errs = [], []

    retry_options = FibonacciRetry(attempts=_RETRY_MAX, statuses=_RETRY_STATUSES)
    async with RetryClient(raise_for_status=False, retry_options=retry_options) as rc:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            # return_exceptions=True hands failed downloads back as results, so
            # one bad record is logged in ``errs`` instead of aborting the whole
            # run while the other requests of the chunk are still in flight.
            results = await asyncio.gather(
                *(get_html_in_aio(cc_info, rc) for cc_info in cc_infos_chnk),
                return_exceptions=True,
            )
            for html, cc_info in zip(results, cc_infos_chnk):
                try:
                    if isinstance(html, BaseException):
                        # Re-raise so ordinary errors are recorded below, while
                        # non-Exception signals (e.g. cancellation) propagate.
                        raise html
                    if not html:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = html
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
        return cc_info_htmls, errs


# Per-thread storage; _get_session() caches one requests.Session per thread here.
_THREAD_LOCAL = threading.local()


def _get_session():
    """Return the calling thread's ``requests.Session``, creating it on first use.

    A new session gets an HTTPS adapter that retries up to ``_RETRY_MAX``
    times on the status codes in ``_RETRY_STATUSES`` with a backoff factor
    of 1.
    """
    try:
        return _THREAD_LOCAL.session
    except AttributeError:
        # First call on this thread: fall through and build the session.
        pass

    session = requests.Session()
    retry_policy = Retry(
        total=_RETRY_MAX, backoff_factor=1, status_forcelist=_RETRY_STATUSES
    )
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    _THREAD_LOCAL.session = session
    return session


def get_html(cc_info, timeout=(10, 60)):
    """Fetch one WARC record with a ranged GET and return the text it carries.

    Args:
        cc_info: Mapping with the keys ``warc_filename``,
            ``warc_record_offset``, ``warc_record_length``,
            ``content_charset`` and ``url``.
        timeout: Passed to ``requests`` as the ``timeout`` of the GET call,
            either one number of seconds or a ``(connect, read)`` tuple.
            Without it a stalled connection would block the calling worker
            thread forever. Pass ``None`` to wait indefinitely.

    Returns:
        The stripped text that follows the second blank line of the decoded
        record (presumably the HTTP body), or ``None`` when the record is
        empty or has fewer than three blank-line separated parts.

    Raises:
        RuntimeError: Wraps any failure while downloading (including a
            timeout), decompressing or decoding; the message carries the
            record's ``url``.
    """
    url = f"https://commoncrawl.s3.amazonaws.com/{cc_info['warc_filename']}"
    begin, length = int(cc_info["warc_record_offset"]), int(cc_info["warc_record_length"])
    end = begin + length - 1
    charset = cc_info["content_charset"]
    if pd.isna(charset):
        charset = "utf-8"

    s = _get_session()
    try:
        with s.get(url, headers={"Range": f"bytes={begin}-{end}"}, timeout=timeout) as r:
            # A ranged request must be answered with 206 Partial Content.
            if r.status_code != requests.codes.PARTIAL_CONTENT:
                raise ValueError(f"{r.status_code}: {r.reason}")
            with io.BytesIO(r.content) as gzf, gzip.GzipFile(fileobj=gzf) as f:
                b = f.read()
                try:
                    if charset in ("UTF-16", "UTF-16LE", "UTF-16BE"):
                        # UTF-16 records are not decoded with the declared
                        # charset; charset-normalizer guesses instead.
                        d = str(from_bytes(b).best()).strip()
                    else:
                        d = b.decode(charset, "backslashreplace").strip()
                except LookupError as le:
                    # Unknown codec name: let charset-normalizer guess.
                    print(f"{repr(le)}: {cc_info['url']}; switching to charset-normalizer")
                    d = str(from_bytes(b).best()).strip()
                if not d:
                    return None
                data_parts = d.split("\r\n\r\n", 2)
                return data_parts[2].strip() if len(data_parts) == 3 else None
    except Exception as err:
        raise RuntimeError(f"{repr(err)}: {cc_info['url']}") from err


def _get_html_or_error(cc_info):
    """Run ``get_html`` and return its exception instead of raising it."""
    try:
        return get_html(cc_info)
    except Exception as err:
        return err


def get_htmls_in_parallel(cc_infos, chunk_size=256):
    """Download the HTML of many Common Crawl records with joblib threads.

    Records are processed in chunks of ``chunk_size`` using ``chunk_size``
    jobs (threads preferred).

    Args:
        cc_infos: Iterable of mutable mappings accepted by ``get_html``. Each
            mapping is updated in place with an ``"html"`` key on success or
            an ``"err"`` key on failure.
        chunk_size: Number of records per chunk and number of joblib jobs.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the records that received HTML and
        the records that failed (download error or empty HTML).
    """
    cc_info_htmls, errs = [], []
    with Parallel(n_jobs=chunk_size, prefer="threads") as parallel:
        for cc_infos_chnk in tqdm(tuple(get_chunk_iter(cc_infos, chunk_size))):
            # The wrapper returns exceptions as values; calling get_html
            # directly would let one failed record abort the whole run
            # before it could be recorded in ``errs``.
            results = parallel(
                delayed(_get_html_or_error)(cc_info) for cc_info in cc_infos_chnk
            )
            for html, cc_info in zip(results, cc_infos_chnk):
                try:
                    if isinstance(html, Exception):
                        raise html
                    if not html:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = html
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
        return cc_info_htmls, errs


def get_htmls(cc_infos, chunk_size=256):
    """Download the HTML of many Common Crawl records with a thread pool.

    Records are processed in chunks of ``chunk_size``; one pool with
    ``chunk_size`` workers serves every chunk, so the per-thread sessions
    created by ``_get_session`` are reused across chunks instead of being
    thrown away with a fresh pool each time.

    Args:
        cc_infos: Iterable of mutable mappings accepted by ``get_html``. Each
            mapping is updated in place with an ``"html"`` key on success or
            an ``"err"`` key on failure.
        chunk_size: Number of records per chunk and number of worker threads.

    Returns:
        A ``(cc_info_htmls, errs)`` tuple: the records that received HTML and
        the records that failed (download error or empty HTML). Within a
        chunk, records are appended in completion order.
    """
    cc_info_htmls, errs = [], []
    chunks = tuple(get_chunk_iter(cc_infos, chunk_size))
    if not chunks:
        # Nothing to download; also avoids creating a pool that is never used.
        return cc_info_htmls, errs

    with concurrent.futures.ThreadPoolExecutor(max_workers=chunk_size) as exctr:
        for cc_infos_chnk in tqdm(chunks):
            html_ftr_to_cc_info = {
                exctr.submit(get_html, cc_info): cc_info for cc_info in cc_infos_chnk
            }
            # Every future of the chunk is consumed before the next chunk is
            # submitted, so the progress bar still advances chunk by chunk.
            for ftr in concurrent.futures.as_completed(html_ftr_to_cc_info):
                cc_info = html_ftr_to_cc_info[ftr]
                try:
                    html = ftr.result()
                    if not html:
                        raise ValueError(f"Empty HTML: {cc_info['url']}")
                    cc_info["html"] = html
                    cc_info_htmls.append(cc_info)
                except Exception as err:
                    cc_info["err"] = {"src": "get_htmls", "val": repr(err)}
                    errs.append(cc_info)
                    print(repr(err))
    return cc_info_htmls, errs

Some old PHP code for HTML text extraction

HtmlContentExtractor

class HtmlContentExtractor
{
    private $dom;
    protected $xpath;

    /**
     * Constructor
     *
     * Stores the document, forces its encoding to UTF-8, turns off formatted
     * output and creates the DOMXPath used by every query of this class.
     * A custom error handler is installed for the duration of the setup only
     * and is restored on both the success and the failure path.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        // Route PHP errors raised during setup to the project's handler.
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        // Enable libxml's internal error handling; this constructor does not
        // revert the setting afterwards.
        libxml_use_internal_errors(true);

        try {
            $this->dom = $dom;
            if (!$this->dom) {
                throw new \Exception("DOMDocument is invalid.");
            }
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            // Undo set_error_handler() before propagating the failure.
            restore_error_handler();
            throw $e;
        }

        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * Collects, in this order: text blocks of the document body, the content
     * of description/keywords meta tags, the value of button-like and text
     * inputs, input placeholders and alt attributes.
     *
     * Keys are a plain node path for text nodes without a matching ancestor
     * and for meta, input and attribute entries. For text captured through an
     * enclosing element the key is "nodePath|openingTag|closingTag" and the
     * value is that element's inner HTML.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = array();

        $blocks = array();
        // All text nodes whose parent is not script/style/code and is not
        // marked translate='no'.
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // Helper defined outside this class; presumably orders the nodes by
        // depth -- confirm against its definition.
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $e */
        foreach ($elementArray as $e) {
            //$temp = self::whiteSpaceNormalization($e->C14N());
            // Skip text nodes that hold nothing but markup, white space and
            // digits.
            $temp = self::whiteSpaceNormalization($this->dom->saveHTML($e));
            $temp = preg_replace("/<[^>]+>/u", "", $temp);
            $temp = preg_replace("/[\s\d]+/u", "", $temp);
            if ($temp !== '') {
                $nodePath = $e->getNodePath();
                // Ancestors of the text node that may serve as its enclosing
                // block; each alternative excludes elements that contain a
                // table (and, for most tags, a code element).
                $ancestorQuery
                    = "$nodePath/ancestor::p[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::a[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::div[not(descendant::table)"
                     ." and not(descendant::div)"
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::font[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::span[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::li[not(descendant::table)"
                     ." and not(descendant::li)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::dt[not(descendant::table)]"
                     ." | $nodePath/ancestor::dd[not(descendant::table)]"
                     ." | $nodePath/ancestor::td[not(descendant::table)"
                     ." and not(descendant::div) "
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::th[not(descendant::table)]"
                     ." | $nodePath/ancestor::b[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::i[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::u[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::tt[not(descendant::table)]"
                     ." | $nodePath/ancestor::blockquote[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strike[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::em[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strong[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::iframe[not(descendant::table)"
                     ." and not(descendant::div) and normalize-space(text())]"
                ;
                $ancestors = $this->xpath->query($ancestorQuery);
                if (!$ancestors) {
                    throw new \Exception(
                        "$ancestorQuery is an incorrect XPath query."
                    );
                } elseif (0 === $ancestors->length) {
                    // No enclosing block: keep the bare text node.
                    //$blocks[$nodePath] = $e->C14N();
                    $blocks[$nodePath] = $this->dom->saveHTML($e);
                } else {
                    // If any matching ancestor was already stored, this text
                    // is part of that block and must not be stored again.
                    $isExtractedBlock = false;
                    for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                        $blockNode = $ancestors->item($i);
                        //$blockOuterC14N = $blockNode->C14N();
                        $blockOuterC14N = $this->dom->saveHTML($blockNode);
                        $blockNodePath = $blockNode->getNodePath();
                        // Split the outer HTML into [opening tag, the rest].
                        $headAndTheRest = preg_split(
                            "/^(<$blockNode->nodeName[^>]*>)/iu",
                            $blockOuterC14N,
                            2,
                            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                        );
                        $head = $headAndTheRest[0];
                        $tail = "</$blockNode->nodeName>";

                        $key = "$blockNodePath|$head|$tail";
                        if (array_key_exists($key, $blocks)) {
                            $isExtractedBlock = true;
                            break;
                        }
                    }
                    if ($isExtractedBlock) {
                        continue;
                    }

                    // Store the last ancestor of the result list (presumably
                    // the innermost one, if results are in document order --
                    // confirm).
                    $blockNode = $ancestors->item($ancestors->length - 1);
                    //$blockOuterC14N = $blockNode->C14N();
                    $blockOuterC14N = $this->dom->saveHTML($blockNode);
                    $blockNodePath = $blockNode->getNodePath();
                    $headAndTheRest = preg_split(
                        "/^(<$blockNode->nodeName[^>]*>)/iu",
                        $blockOuterC14N,
                        2,
                        PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
                    );
                    $head = $headAndTheRest[0];
                    $tail = "</$blockNode->nodeName>";
                    // Inner HTML = everything before the last closing tag
                    // (matched case-insensitively).
                    $end = strripos($headAndTheRest[1], $tail);
                    $blockInnerC14N = substr($headAndTheRest[1], 0, $end);
                    $blocks["$blockNodePath|$head|$tail"] = $blockInnerC14N;
                }
            }
        }
        // Blocks are returned in the reverse of their insertion order, with
        // carriage-return entities removed.
        $blocks = array_reverse($blocks);
        foreach ($blocks as $path => $html) {
            $textArray[$path] = str_replace('&#xD;', '', $html);
        }

        // Description and keywords meta tags with a non-empty content.
        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            // Ignore contents made of white space and digits only.
            if (1 != preg_match("/^[\s\d]+$/u", $meta->getAttribute("content"))) {
                $textArray[$meta->getNodePath()]
                    = $meta->getAttribute("content");
            }
        }

        // Inputs of the listed types that carry a non-empty value.
        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("value"))) {
                $textArray[$input->getNodePath()]
                    = $input->getAttribute("value");
            }
        }
        // Placeholders are keyed as "<input path>/@placeholder" so they do
        // not collide with the value entry of the same input.
        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        foreach ($inputsWithPlaceholder as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("placeholder"))
            ) {
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $input->getAttribute("placeholder");
            }
        }
        //        $attributes =
        //            $this->m_xpath->query(
        //                "//*["
        //                    ."string(@abbr) or string(@alt) or string(@label)"
        //                    ." or string(@title) or string(@standby)"
        //                    ." or string(@summary)"
        //                ."]");
        //        foreach ($attributes as $a)
        //            $textArray[$a->getNodePath()] = $a->C14N();
        // Only the alt attribute is extracted; the wider attribute list above
        // is disabled.
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get the extracted texts without their node paths
     *
     * @return array Values of getNodePathAndTextMap(), re-indexed from 0
     * @throws \Exception
     */
    public function getTextArray()
    {
        $pathToText = $this->getNodePathAndTextMap();

        return array_values($pathToText);
    }

    /**
     * Get the extracted texts paired with the line number of their node
     *
     * @return array List of [line number, text] pairs, in the order of
     *               getNodePathAndTextMap()
     * @throws \Exception
     */
    public function getTextArrayWithLineNumber()
    {
        $pairs = [];
        foreach ($this->getNodePathAndTextMap() as $mapKey => $text) {
            // Keys may look like "path|head|tail"; only the part before the
            // first "|" is queried as an XPath.
            $keyParts = explode('|', $mapKey);
            /** @var \DOMNode $node */
            $node = $this->xpath->query($keyParts[0])->item(0);
            $pairs[] = [$node->getLineNo(), $text];
        }

        return $pairs;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Node path of every img element with a non-empty src,
     *               mapped to that src value
     */
    public function getNodePathAndImageMap()
    {
        $pathToSrc = array();

        /** @var \DOMNode $image */
        foreach ($this->xpath->query("//img[string(@src)]") as $image) {
            $srcAttribute = $image->attributes->getNamedItem('src');
            $pathToSrc[$image->getNodePath()] = $srcAttribute->nodeValue;
        }

        return $pathToSrc;
    }

    /**
     * Get the image sources without their node paths
     *
     * @return array Values of getNodePathAndImageMap(), re-indexed from 0
     */
    public function getImageArray()
    {
        $pathToSrc = $this->getNodePathAndImageMap();

        return array_values($pathToSrc);
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * White space between tags is removed first; then line breaks and the
     * leading/trailing white space of every line are removed.
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly When true, every double quote is prefixed
     *                             with a backslash
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $withoutGaps = preg_replace('~>\s+<~', '><', $html);
        $oneLine = preg_replace('/^\s+|\n|\r|\s+$/um', '', $withoutGaps);

        return $jsonFriendly ? str_replace('"', '\"', $oneLine) : $oneLine;
    }

    /**
     * Encode every character of the string as a decimal HTML entity
     *
     * The UTF-8 input is converted to UTF-32 and each 32-bit unit is read as
     * a big-endian integer, so "A" becomes "&#65;".
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        $utf32 = mb_convert_encoding($str, 'UTF-32', 'UTF-8');
        $toEntity = function ($codePoint) {
            return "&#$codePoint;";
        };

        return implode("", array_map($toEntity, unpack("N*", $utf32)));
    }

    /**
     * Normalize white space inside the text
     *
     * The text is first turned into decimal entities so that uncommon white
     * space characters can be matched by their code point; every run of them
     * becomes one ordinary space. After decoding, runs of regular white
     * space and of the listed literal entities (e.g. "&nbsp;") are collapsed
     * to one space as well, and the result is trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception When the first preg_replace call fails
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);

        // replace uncommon white space with ordinary white space
        // (every alternative must be separated by "|"; a missing separator
        // fuses two entities into one sequence that only matches when both
        // appear back to back)
        $text = preg_replace(
            '/(\&\#5760\;|\&\#6158\;|'
            .'\&\#8192\;|\&\#8193\;|'
            .'\&\#8194\;|\&\#8195\;|\&\#8196\;|\&\#8197\;|\&\#8198\;|\&\#8199\;|'
            .'\&\#8200\;|\&\#8201\;|\&\#8204\;|\&\#8205\;|\&\#8206\;|\&\#8207\;|'
            .'\&\#8202\;|\&\#8239\;|\&\#8287\;|\&\#12288\;|\&\#10\;|'
            .'\&\#11\;|\&\#12\;|\&\#13\;|\&\#133\;|\&\#8232\;|\&\#8233\;|'
            .'\&\#32\;|\&\#09\;|\&\#11\;|'
            .'\&\#160\;|\&\#9\;)+/u',
            " ",
            $text
        );

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = trim(
            preg_replace(
                '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
                .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
                .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
                .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
                .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&#xD\;|\x{FEFF})+/u',
                " ",
                $text
            )
        );

        return $text;
    }
}

HtmlContentExtractorTest

class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    protected $dataFolderPath;
    protected $oInnPage;
    protected $solarePage;

    /** @var \DOMDocument $phpDom */
    protected $phpDom;

    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;

    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the fixture folder next to this file and the paths of the two
     * sample pages used by the tests
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();

        $dataFolder = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->dataFolderPath = $dataFolder;
        $this->oInnPage = $dataFolder .'www.o-inn.co.jp_index.html';
        $this->solarePage = $dataFolder .'www.solarehotels.com.html';
    }

    /**
     * Test the line number of the first extracted text on a PhpDom document
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Same check on an Html5Dom document; currently skipped
     *
     * Kept as its own test so that the skip does not hide the result of the
     * PhpDom assertion above and no code sits unreachable behind the skip
     * call inside a passing test.
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumberOnHtml5Dom()
    {
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');
        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test that meta keywords and an interpolated paragraph are stored under
     * the expected node-path keys, for both DOM builders
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        // Solare page: only the meta keywords entry is checked.
        $metaPath = "/html/head/meta[5]";
        $metaText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
                ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        $solareDoms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($solareDoms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $map = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($metaPath, $map);
            $this->assertEquals($metaText, $map[$metaPath]);
        }

        // O-Inn page: meta keywords plus a paragraph with an inline link.
        $metaPath = "/html/head/meta[3]";
        $metaText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
                ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $paragraphKey
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        $paragraphHtml
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                    .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                    .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                . PHP_EOL //< Masterminds\HTML5 seems not using source's EOL.
                .'            ホテルお茶の水イン'
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $oInnDoms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($oInnDoms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $map = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($metaPath, $map);
            $this->assertEquals($metaText, $map[$metaPath]);
            $this->assertArrayHasKey($paragraphKey, $map);
            $this->assertEquals($paragraphHtml, $map[$paragraphKey]);
        }
    }

    /**
     * Test that the flat text list contains the expected blocks, for both
     * DOM builders
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        // Solare page: a block that keeps its <br> and escaped ampersand.
        $solareBlock
            = '　　* 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &amp;'
            .' RESORTS official website.<br>'."\n"
            .'　　* 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        $solareDoms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($solareDoms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $texts = $this->extractor->getTextArray();
            $this->assertContains($solareBlock, $texts);
        }

        // O-Inn page: an address block and a paragraph with an inline link.
        $addressBlock
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
                .'東京都文京区湯島1-3-7<br>TEL：03-3813-8211<br>'
                .'FAX：03-3813-9730<br>'
                .'<a href="/transportation/">お茶の水インまでの地図</a>'
            ;
        $linkParagraph
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                ."\n            ホテルお茶の水イン"
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $oInnDoms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($oInnDoms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $texts = $this->extractor->getTextArray();
            $this->assertContains($addressBlock, $texts);
            $this->assertContains($linkParagraph, $texts);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * The fixture contains a paragraph with a surplus closing anchor tag;
     * the extracted text must not contain that extra tag.
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        $unpreparedHtml = file_get_contents($this->oInnPage);
        $illformedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n            ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        // Strict comparison: strpos() returns 0 for a match at the start,
        // which a loose comparison would treat as equal to false.
        $this->assertNotSame(false, strpos($unpreparedHtml, $illformedCase));

        $expectedCase
            = '掲載されている'
                . '<a href="http://www.tripadvisor.jp/'
                . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
                . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
                . "\n            ホテルお茶の水イン</a>"
                . 'のクチコミはTripAdvisorより提供を受けています';
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);
        }
    }

    /**
     * Test that an input's placeholder is stored under "<path>/@placeholder",
     * for both DOM builders
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $fixture = $this->dataFolderPath .'inputPlaceholderTest.html';
        $placeholderKey = '/html/body/input/@placeholder';
        $placeholderText = 'お名前';

        foreach ([PhpDom::make($fixture), Html5Dom::make($fixture)] as $dom) {
            $map = (new HtmlContentExtractor($dom))->getNodePathAndTextMap();
            $this->assertArrayHasKey($placeholderKey, $map);
            $this->assertEquals($placeholderText, $map[$placeholderKey]);
        }
    }

    /**
     * Test that an image's alt text is stored under the attribute's node
     * path, for both DOM builders
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $fixture = $this->dataFolderPath .'imageAltTest.html';
        $altKey = '/html/body/img/@alt';
        $altText = '画像です';

        foreach ([PhpDom::make($fixture), Html5Dom::make($fixture)] as $dom) {
            $map = (new HtmlContentExtractor($dom))->getNodePathAndTextMap();
            $this->assertArrayHasKey($altKey, $map);
            $this->assertEquals($altText, $map[$altKey]);
        }
    }

    /**
     * Test BOM removal: the text node at the given path must not appear in
     * the map, for both DOM builders
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $fixture = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        $bomOnlyPath = '/html/body/div[1]/text()[3]';

        foreach ([PhpDom::make($fixture), Html5Dom::make($fixture)] as $dom) {
            $map = (new HtmlContentExtractor($dom))->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($bomOnlyPath, $map);
        }
    }

//    /**
//     * Test sorting the node-path-to-text map by the line numbers of the HTML
//     *
//     * @return void
//     */
//    public function testSortNodePathAndTextMapByLineNum()
//    {
//        $file = $this->dataFolderPath .'replaceNodeXpath.html';
//        $doms = [PhpDom::make($file), Html5Dom::make($file)];
//        foreach ($doms as $dom) {
//            $extractor = new HtmlContentExtractor($dom);
//            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
//            $this->assertEquals(
//                'HTML Test', array_values($nodePathAndTextMap)[0]
//            );
//            $this->assertEquals(
//                '選べるリージョンとゾーン',
//                array_values($nodePathAndTextMap)[4]
//            );
//        }
//    }

}

PartialHtmlWrapper

/**
 * Surrounds an HTML fragment with a fixed document header and footer, and
 * strips those fixed strings from a document again.
 */
class PartialHtmlWrapper
{
    // Document prologue that wrap() puts in front of the fragment.
    const HEADER = <<<HTML_HEADER
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"></head><body>
HTML_HEADER;

    // Second form of the prologue that unwrap() also removes.
    // NOTE(review): the apostrophe in front of </meta> looks unintended;
    // confirm against whatever produces this header before changing it.
    const HEADER_C14N = <<<HTML_HEADER_C14N
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type">'</meta></head><body>
HTML_HEADER_C14N;

    // Document epilogue that wrap() appends; it ends with a line break.
    const FOOTER = <<<HTML_FOOTER
</body></html>

HTML_FOOTER;

    /**
     * Build a full document string: HEADER, the fragment, then FOOTER
     *
     * @param string $partialHtml HTML fragment to embed
     *
     * @return string
     */
    public static function wrap($partialHtml)
    {
        return implode('', array(self::HEADER, $partialHtml, self::FOOTER));
    }

    /**
     * Remove the wrapper strings from a document
     *
     * Every occurrence of HEADER_C14N, HEADER and FOOTER is removed, in
     * that order, wherever it appears in the input.
     *
     * @param string $wrappedPartialHtml Document produced by wrap()
     *
     * @return string
     */
    public static function unwrap($wrappedPartialHtml)
    {
        $stripped = $wrappedPartialHtml;
        $boilerplates = array(self::HEADER_C14N, self::HEADER, self::FOOTER);
        foreach ($boilerplates as $boilerplate) {
            $stripped = str_replace($boilerplate, '', $stripped);
        }

        return $stripped;
    }
}

English HTML sentence segmenter

/**
 * Splits English HTML text into sentences.
 *
 * preprocess() replaces spans that must not be split (quotations, simple
 * HTML elements, titles such as "Mr.", ellipses) with placeholders,
 * getSentences() cuts the masked text at full stops, and postprocess()
 * puts the original text back in place of the placeholders.
 */
class EnglishSentenceSegmenter extends AbstractSentenceSegmenter
{
    /**
     * Placeholder => original text. Rebuilt by every preprocess() call and
     * read by postprocess(). Initialised so that postprocess() is safe to
     * call before preprocess().
     *
     * @var array
     */
    private $_replacements = array();

    /**
     * Tags whose "<tag ...>text</tag>" spans are registered as quote pairs
     *
     * @var array
     */
    private $_quoteLikeTags
        = array(
            "span", "font", "a", "li",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "p"
        );

    protected $titles
        = array(
            "Mr.", "Ms.", "Mrs.", "Dr.", "Prof.",
            "M.", "Ph.D.", "D.Phil.", "M.D.", "D.O.",
            "Capt.", "Cpl.", "Sgt.", "Maj.", "Gen.", "Messrs."
        );
    protected $ellipsis = array("...", ". . .");

    /**
     * Constructor
     *
     * Also switches the mbstring internal encoding to UTF-8.
     *
     * @param bool $preserveSpaces Keep leading white space of the text and
     *                             of each sentence when true
     */
    public function __construct($preserveSpaces=false)
    {
        parent::__construct($preserveSpaces);
        mb_internal_encoding("UTF-8");
    }

    /**
     * Pre-process
     *
     * Normalises white space, then masks quoted spans, titles and ellipses
     * with "<qN>", "<tN>" and "<eN>" placeholders. The placeholder map is
     * kept for postprocess().
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    public function preprocess($rawHtml)
    {
        $cookedText = $rawHtml;

        if (!$this->_preserveSpaces) {
            $cookedText = preg_replace("/[\r\n\s]+/u", " ", $cookedText);
            $cookedText = trim($cookedText);
        } else {
            // Preserve white spaces at the beginning of the sentence
            $cookedText = rtrim($cookedText);
        }

        $this->_replacements = array();

        foreach ($this->_quoteLikeTags as $tagName) {
            $this->addHtmlTagAsQuote(
                $tagName, $cookedText, $this->quoteKeys, $this->quotePairs
            );
        }

        $cookedText = $this->_maskQuotedSpans(
            $cookedText, $this->_findQuotedSpans($cookedText)
        );
        $cookedText = $this->_maskPhrases($cookedText, $this->titles, "t");
        $cookedText = $this->_maskPhrases($cookedText, $this->ellipsis, "e");

        return $cookedText;
    }

    /**
     * Locate the non-overlapping quoted spans of the text
     *
     * @param string $cookedText Text to scan
     *
     * @return array Character offset of the opening quote => character
     *               offset just past the closing quote, ascending
     */
    private function _findQuotedSpans($cookedText)
    {
        $candidates = array();
        foreach ($this->quoteKeys as $openQuote) {
            $closeQuote = $this->quotePairs[$openQuote];
            $openLength = mb_strlen($openQuote);
            $closeLength = mb_strlen($closeQuote);
            $isPlainQuote = ("\"" === $openQuote || "'" === $openQuote);

            $begin = mb_strpos($cookedText, $openQuote);
            while (false !== $begin) {
                // Continue right after this opening quote by default, so a
                // rejected quote is stepped over in one go.
                $offset = $begin + $openLength;

                // " and ' open a quotation only at the start of the text or
                // after a space; otherwise they are apostrophes or closing
                // quotes. mb_substr keeps the look-behind in character
                // units, matching the offsets returned by mb_strpos.
                $opensQuotation = !$isPlainQuote
                    || 0 === $begin
                    || " " === mb_substr($cookedText, $begin - 1, 1);

                if ($opensQuotation) {
                    $end = mb_strpos($cookedText, $closeQuote, $offset);
                    if (false !== $end) {
                        $offset = $end + $closeLength;
                        $candidates[$begin] = $offset;
                    }
                }
                $begin = mb_strpos($cookedText, $openQuote, $offset);
            }
        }

        // Keep the earliest span and drop every later span that starts
        // inside an already accepted one.
        ksort($candidates);
        $spans = array();
        $lastSpanEnd = 0;
        foreach ($candidates as $begin => $spanEnd) {
            if ($begin >= $lastSpanEnd) {
                $spans[$begin] = $spanEnd;
                $lastSpanEnd = $spanEnd;
            }
        }

        return $spans;
    }

    /**
     * Replace each quoted span with a "<qN>" placeholder
     *
     * The recorded offsets are used directly instead of searching for the
     * quote characters again, so a rejected apostrophe between two
     * quotations can not be mistaken for the next opening quote.
     *
     * @param string $cookedText Text to mask
     * @param array  $spans      Result of _findQuotedSpans() for that text
     *
     * @return string
     */
    private function _maskQuotedSpans($cookedText, array $spans)
    {
        $pieces = array();
        $cursor = 0;
        $count = 0;
        foreach ($spans as $begin => $spanEnd) {
            $key = "<q". $count++ .">";
            $this->_replacements[$key]
                = mb_substr($cookedText, $begin, $spanEnd - $begin);
            $pieces[] = mb_substr($cookedText, $cursor, $begin - $cursor);
            $pieces[] = $key;
            $cursor = $spanEnd;
        }
        $pieces[] = mb_substr($cookedText, $cursor);

        return implode('', $pieces);
    }

    /**
     * Replace every occurrence of each phrase with "<{prefix}{index}>"
     *
     * Longer phrases are masked first so that a phrase contained in
     * another one ("M." inside "M.D.") can not break up the longer phrase.
     *
     * @param string $cookedText Text to mask
     * @param array  $phrases    Phrases to protect
     * @param string $prefix     Placeholder prefix letter
     *
     * @return string
     */
    private function _maskPhrases($cookedText, array $phrases, $prefix)
    {
        $lengths = array_map('strlen', $phrases);
        arsort($lengths);
        foreach (array_keys($lengths) as $i) {
            $key = "<". $prefix . $i .">";
            $cookedText = str_replace($phrases[$i], $key, $cookedText);
            $this->_replacements[$key] = $phrases[$i];
        }

        return $cookedText;
    }

    /**
     * Post-process
     *
     * Restores the text that preprocess() replaced with placeholders.
     * Every occurrence is restored because a title or an ellipsis that
     * appears several times shares a single placeholder.
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    public function postprocess($cookedText)
    {
        foreach ($this->_replacements as $key => $value) {
            $cookedText = str_replace($key, $value, $cookedText);
        }

        return $cookedText;
    }

    /**
     * Trim a sentence according to the preserve-spaces setting
     *
     * @param string $sentence Sentence to trim
     *
     * @return string
     */
    private function _trimSentence($sentence)
    {
        // Preserve white spaces at the beginning of the sentence
        return $this->_preserveSpaces ? rtrim($sentence) : trim($sentence);
    }

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    public function getSentences($rawHtml)
    {
        $text = $this->preprocess($rawHtml);
        // Byte lengths and offsets throughout: PREG_OFFSET_CAPTURE reports
        // byte offsets, so strlen/substr are the matching functions here.
        $length = strlen($text);

        $sentences = array();
        $begin = 0;
        $offset = 0;
        $matches = array();

        while (1 === preg_match(
            $this->fullStop, $text, $matches, PREG_OFFSET_CAPTURE, $offset
        )) {
            $matchedFullStop = $matches[0][0];
            $matchedPos = $matches[0][1];
            $next = $matchedPos + strlen($matchedFullStop);

            // A lone period ends a sentence only at the very end of the
            // text or when a space follows; any other full stop always
            // ends one.
            $isValid = "." != $matchedFullStop
                || $next == $length
                || ($matchedPos + 2 < $length
                    && " " == $text[$matchedPos + 1]);

            if ($isValid) {
                $sentences[]
                    = $this->_trimSentence(
                        substr($text, $begin, $next - $begin)
                    );
                $begin = $next;
            }
            $offset = $next;
        }

        // Whatever follows the last accepted full stop is one more sentence
        if ($begin < $length) {
            $sentences[] = $this->_trimSentence(substr($text, $begin));
        }

        return array_map(array($this, 'postprocess'), $sentences);
    }
}

/**
 * Base class of the sentence segmenters.
 *
 * Holds the full-stop pattern and the quote pairs shared by the concrete
 * segmenters, and the helper that registers HTML tags as extra quote pairs.
 */
abstract class AbstractSentenceSegmenter
{

    /**
     * Value of the constructor argument; read by subclasses
     *
     * @var bool
     */
    protected $_preserveSpaces;

    protected $fullStop = '/[.?!]+/u';
    protected $quoteKeys = array("\"", "'", "“", "‘", "(", "[", "{");
    protected $quotePairs
        = array(
            "\"" => "\"", "'" => "'",
            "“"  => "”", "‘" => "’",
            "("  => ")", "[" => "]", "{" => "}"
        );

    /**
     * Constructor
     *
     * @param bool $preserveSpaces Stored in $_preserveSpaces
     */
    public function __construct($preserveSpaces=false)
    {
        $this->_preserveSpaces=$preserveSpaces;
    }

    /**
     * Add HTML tag as quotation marks
     *
     * Finds every "<tag ...>text</tag>" whose content holds no other tag
     * and registers the opening tag, exactly as written, as a quote key
     * paired with the closing tag found after it.
     * 
     * @param string $tagName     Name of HTML tag
     * @param string $cookedText  The text to search
     * @param array  &$quoteKeys  The reference of quoteKeys array
     * @param array  &$quotePairs The reference of quotePairs array
     * 
     * @return void
     */
    protected function addHtmlTagAsQuote(
        $tagName, $cookedText, array &$quoteKeys, array &$quotePairs
    ) {
        // Quote the name so it is matched literally, and require ">" or
        // white space right after it so that "a" does not match "<abbr>"
        // and "p" does not match "<pre>".
        $tag = preg_quote($tagName, '|');
        $pattern
            = '|(<'. $tag .'(?:\s[^>]*)?>)[^<]*(</'. $tag .'>)|ius';

        $matches = array();
        $isFound
            = preg_match_all($pattern, $cookedText, $matches, PREG_SET_ORDER);
        if (false === $isFound || 0 === $isFound) {
            return;
        }

        foreach ($matches as $match) {
            $openTag = $match[1];
            if (!in_array($openTag, $quoteKeys, true)) {
                $quoteKeys[] = $openTag;
                // Matching ignores case, so store the closing tag as it
                // appears in the text; callers look it up case-sensitively.
                $quotePairs[$openTag] = $match[2];
            }
        }
    }

    /**
     * Pre-process
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    abstract public function preprocess($rawHtml);

    /**
     * Post-process
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    abstract public function postprocess($cookedText);

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array
     */
    abstract public function getSentences($rawHtml);
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Home

Code snippets for concurrent download of WARC

Some old PHP code for HTML text extraction

Clone this wiki locally