库不支持python3 #2

hurricanetx · 2017-05-25T11:58:19Z

model.py 修改为

`
#!/bin/env python
#encoding=utf-8
import re
import urllib
import urllib.parse

import lxml
import lxml.html

from .tags_util import clean_tags_only, clean_tags_hasprop, clean_tags_exactly, clean_tags
from .region import Region

class PageModel(object):
    """Extract the title and the main content items of an HTML page.

    The raw markup is first passed through the ``tags_util`` cleaners, then
    parsed with ``lxml.html``; ``Region`` is asked to locate the subtree that
    ``extract`` turns into a list of text / image / table items.
    """

    # Attributes probed, in this order, for the address of an <img>; the first
    # one that is present wins.
    _IMG_SRC_ATTRS = ('original', 'file', 'data-original', 'src-info', 'data-src', 'src')

    def __init__(self, page, url=""):
        """Clean and parse *page*.

        Args:
            page: HTML source as ``str``.
            url: Address the page came from; when non-empty, image sources
                are resolved against it.
        """
        assert isinstance(page, str)
        for tag in ['style', 'script']:
            page = clean_tags(page, tag)
        page = clean_tags_hasprop(page, "div", "(display:.?none|comment|measure)")
        page = clean_tags_only(page, "(span|section|font|em)")
        self.doc = lxml.html.fromstring(page)
        self.url = url
        self.region = Region(self.doc)
        # Text shorter than this (whitespace removed) is dropped unless it
        # sits in the dominant tag or in an <li>.
        self.impurity_threshold = 30
        # Paragraphs whose link text exceeds this share of their text are removed.
        self.anchor_ratio_limit = 0.3
        self.stripper = re.compile(r'\s+')

    def extract_content(self, region):
        """Collect the content items found under *region*.

        Args:
            region: Element whose descendant text nodes, descendant ``img``
                elements and direct ``table`` children are examined.

        Returns:
            A list of dicts, in the order the XPath query yields the nodes,
            each shaped ``{"type": "text" | "image" | "html", "data": ...}``.
        """
        items = region.xpath('.//text()|.//img|./table')

        # Histogram: parent tag of each text node -> total stripped text length.
        tag_hist = {}
        for item in items:
            if hasattr(item, 'tag'):
                continue  # elements are handled in the second pass
            t = item.getparent().tag
            tag_hist[t] = tag_hist.get(t, 0) + len(item.strip())

        winner_tag = None
        if tag_hist:
            # Largest text volume wins; ties go to the greater tag name.
            # str() keeps the tie-break from raising on Python 3 when a tag is
            # not a string (lxml uses a factory function as the tag of
            # comment nodes).
            winner_tag = max(tag_hist, key=lambda k: (tag_hist[k], str(k)))

        contents = []
        for item in items:
            if not hasattr(item, 'tag'):
                txt = item.strip()
                parent_tag = item.getparent().tag
                # Short text outside the dominant tag is treated as noise,
                # except for list items.
                if parent_tag != winner_tag \
                        and len(self.stripper.sub("", txt)) < self.impurity_threshold \
                        and parent_tag != 'li':
                    continue
                contents.append({"type": "text", "data": txt})
            elif item.tag == 'table':
                if winner_tag == 'td':
                    # The cell text was already collected as text items.
                    continue
                if item is not region:
                    for el in item.xpath(".//a"):
                        el.drop_tag()
                    # NOTE(review): lxml.html.tostring returns bytes on
                    # Python 3 unless encoding="unicode" is passed; left
                    # unchanged so callers see the same type as before.
                    table_s = lxml.html.tostring(item)
                    contents.append({"type": "html", "data": table_s})
                else:
                    # Relative path: only the cells of this table, not every
                    # <td> in the document.
                    for sub_item in item.xpath(".//td/text()"):
                        contents.append({"type": "text", "data": sub_item})
            elif item.tag == 'img':
                src = None
                for img_prop in self._IMG_SRC_ATTRS:
                    src = item.get(img_prop)
                    if src is not None:
                        break
                # An <img> without any source attribute has nothing to
                # resolve; it is still reported, with src None.
                if src is not None and self.url != "":
                    if not src.startswith(("/", "http", "./")):
                        src = "/" + src
                    src = urllib.parse.urljoin(self.url, src, allow_fragments=False)
                contents.append({"type": "image", "data": {"src": src}})
        return contents

    def extract_title(self):
        """Return the page title.

        The first heading candidate (``h1``/``h2``/``h3``/``p.title`` text)
        that the leading segment of ``<title>`` starts or ends with is
        returned stripped. Otherwise the longest string among that segment
        and the candidates is returned as-is.
        """
        doc = self.doc
        tag_title = doc.xpath("/html/head/title/text()")
        # Keep only what precedes the first "_" or "-" (usually a site suffix).
        s_tag_title = re.split(r'_|-', "".join(tag_title))[0]
        title_candidates = doc.xpath('//h1/text()|//h2/text()|//h3/text()|//p[@class="title"]/text()')
        for c_title in title_candidates:
            c_title = c_title.strip()
            if c_title != "" and (s_tag_title.startswith(c_title) or s_tag_title.endswith(c_title)):
                return c_title
        # Longest stripped length first; equal lengths fall back to string order.
        return min([s_tag_title] + title_candidates, key=lambda x: (-len(x.strip()), x))

    def extract(self):
        """Run the extraction.

        Returns:
            ``{"title": str, "content": list}``; the title is empty and the
            content list is empty when ``Region.locate()`` finds nothing.
        """
        title = self.extract_title()
        region = self.region.locate()
        if region is None:
            return {'title': '', 'content': []}
        # Drop paragraphs / list items that are mostly link text.
        for p_el in region.xpath(".//p|.//li"):
            count_a = len(" ".join(p_el.xpath(".//a/text()")))
            count_p = len(" ".join(p_el.xpath(".//text()")))
            if float(count_a) / (count_p + 1.0) > self.anchor_ratio_limit:
                p_el.drop_tree()
        # Unwrap inline markup inside the region only (".//b", not "//b",
        # which would match the whole document).
        for el in region.xpath(".//a|.//strong|.//b"):
            el.drop_tag()
        content = self.extract_content(region)
        return {"title": title, "content": content}

`

The text was updated successfully, but these errors were encountered:

fxsjy · 2017-05-25T13:14:14Z

求Pull Request

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

库不支持python3 #2

库不支持python3 #2

hurricanetx commented May 25, 2017

fxsjy commented May 25, 2017

库不支持python3 #2

库不支持python3 #2

Comments

hurricanetx commented May 25, 2017

fxsjy commented May 25, 2017