-
Notifications
You must be signed in to change notification settings - Fork 106
/
extract_article_content_from_HTML.py
107 lines (92 loc) · 4.58 KB
/
extract_article_content_from_HTML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Be sure to install boilerpipe (pip install boilerpipe)
"""
Extract main article content from HTML with Boilerpipe
In this script, we define a function that takes a single HTML news article as
input and returns the primary textual content of the article. We create an
SFrame from a sample of HTML articles, and then we pass the function defined
above to the apply method on the column in our SFrame containing the HTML
source. The clean content is included in our SFrame as a new column. We use the
Python boilerpipe module to extract the clean text.
"""
import graphlab as gl
import os
import requests
from StringIO import StringIO
from urllib import urlopen
from zipfile import ZipFile
def extract_article_content(src_content):
"""
Extract the primary textual content from an HTML news article.
In many cases, the HTML source of news articles is littered with boilerplate
text that you would not want to include when doing text analysis on the
content the page. Even if you could write some rules to extract the content
from one page, it's unlikely that those rules would apply to an article from
another site. The boilerpipe module allows us to solve this problem more
generally.
Parameters
----------
src_conent : str
The source HTML from which to extract the content.
Returns
-------
out : str
The primary content of the page with all HTML and boilerplate text
removed.
Examples
--------
>>> extract_article_content(
"<html><body><p>Turi is in the business of building the best " \
"machine learning platform on the planet. Our goal is to make it " \
"easy for data scientists to build intelligent, predictive " \
"applications quickly and at scale. Given the perplexing array " \
"of tools in this space, we often get asked "Why Turi? What " \
"differentiates it from tools X, Y, and Z?" This blog post aims " \
"to provide some answers. I’ll go into some technical details " \
"about the challenges of building a predictive application, and " \
"how Turi’s ML platform can help.</p></body></html>")
See Also
--------
- `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
- `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
"""
import logging
from boilerpipe.extract import Extractor
if src_content and src_content.strip():
try:
extractor = Extractor(extractor='ArticleExtractor', html=src_content)
return extractor.getText()
except Exception as e:
logging.getLogger().error("Exception (%s) extracting from (%s)" % \
(e.message, src_content[0:25]))
# Get the sample source data
source_s3_path = "https://static.turi.com/datasets/how-to/html_source_sample.zip"
# Create SArray of filenames and read content of each file into SFrame
with ZipFile(StringIO(urlopen(source_s3_path).read())) as z:
source_sa = gl.SArray([f for f in z.namelist() if not os.path.isdir(f) and f.endswith(".html")])
dataset = gl.SFrame({"source": source_sa.apply(lambda f: z.open(f).read())})
# Extract clean text for each HTML file and create new column caled "content"
dataset["content"] = dataset["source"].apply(
lambda src: (extract_article_content(src) or None) if src else None)
# Columns:
# source str
# content str
# Rows: 1000
# Data:
# +-------------------------------+-------------------------------+
# | source | content |
# +-------------------------------+-------------------------------+
# | <!DOCTYPE html PUBLIC "-//... | My good friend from colleg... |
# | <!DOCTYPE html PUBLIC "nul... | Honda "Hackura": un auto, ... |
# | <!DOCTYPE html>\n\n\n<!--[... | None |
# | <!DOCTYPE html PUBLIC "-//... | The Monad is like a bellow... |
# | <!DOCTYPE html>\n<html lan... | Need help cloning? Visit B... |
# | <!DOCTYPE html>\n\n\n<!--[... | None |
# | <!DOCTYPE html>\r\n<!--[if... | Should I Become an Entrepr... |
# | <!DOCTYPE HTML PUBLIC "-//... | Posted by Mikko @ 11:51 GM... |
# | <HTML>\n\n<head>\n<!-- red... | Why Did Human History Unfo... |
# | \n<!-- Our standard header... | Remember selection for thi... |
# | ... | ... |
# +-------------------------------+-------------------------------+
# [1000 rows x 2 columns]
# Note: Only the head of the SFrame is printed.
# You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.