From 7e712f801ebada0f66c66d65eae20f53d2f127d3 Mon Sep 17 00:00:00 2001 From: Ben Ubois Date: Tue, 12 Feb 2019 20:23:13 -0800 Subject: [PATCH 1/2] Initial support for ignoring entities. --- src/extractors/generic/content/extractor.js | 6 +++--- src/extractors/generic/index.js | 6 ++++-- src/extractors/generic/word-count/extractor.js | 2 +- src/resource/index.js | 4 ++-- src/resource/index.test.js | 10 ++++++++++ 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/extractors/generic/content/extractor.js b/src/extractors/generic/content/extractor.js index b4e57a735..6affd50a8 100644 --- a/src/extractors/generic/content/extractor.js +++ b/src/extractors/generic/content/extractor.js @@ -35,7 +35,7 @@ const GenericContentExtractor = { extract({ $, html, title, url }, opts) { opts = { ...this.defaultOpts, ...opts }; - $ = $ || cheerio.load(html); + $ = $ || cheerio.load(html, { decodeEntities: false }); // Cascade through our extraction-specific opts in an ordered fashion, // turning them off as we try to extract content. 
@@ -50,7 +50,7 @@ const GenericContentExtractor = { // eslint-disable-next-line no-restricted-syntax for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) { opts[key] = false; - $ = cheerio.load(html); + $ = cheerio.load(html, { decodeEntities: false }); node = this.getContentNode($, title, url, opts); @@ -80,7 +80,7 @@ const GenericContentExtractor = { return null; } - return normalizeSpaces($.html(node)); + return normalizeSpaces($.html(node, { decodeEntities: false })); }, }; diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js index 3a1b8433f..76c88d0eb 100644 --- a/src/extractors/generic/index.js +++ b/src/extractors/generic/index.js @@ -32,7 +32,7 @@ const GenericExtractor = { const { html, $, contentType = 'html' } = options; if (html && !$) { - const loaded = cheerio.load(html); + const loaded = cheerio.load(html, { decodeEntities: false }); options.$ = loaded; } @@ -53,7 +53,9 @@ const GenericExtractor = { if (contentType === 'html') { convertedContent = content; } else if (contentType === 'text') { - convertedContent = $.text(cheerio.load(content)); + convertedContent = $.text( + cheerio.load(content, { decodeEntities: false }) + ); } else if (contentType === 'markdown') { const turndownService = new TurndownService(); convertedContent = turndownService.turndown(content); diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js index d0e86c8aa..23169220b 100644 --- a/src/extractors/generic/word-count/extractor.js +++ b/src/extractors/generic/word-count/extractor.js @@ -4,7 +4,7 @@ import { normalizeSpaces } from 'utils/text'; const GenericWordCountExtractor = { extract({ content }) { - const $ = cheerio.load(content); + const $ = cheerio.load(content, { decodeEntities: false }); const $content = $('div').first(); const text = normalizeSpaces($content.text()); diff --git a/src/resource/index.js b/src/resource/index.js index e662e22f9..ada2ce4f0 100644 --- 
a/src/resource/index.js +++ b/src/resource/index.js @@ -63,7 +63,7 @@ const Resource = { encodeDoc({ content, contentType }) { const encoding = getEncoding(contentType); let decodedContent = iconv.decode(content, encoding); - let $ = cheerio.load(decodedContent); + let $ = cheerio.load(decodedContent, { decodeEntities: false }); // after first cheerio.load, check to see if encoding matches const metaContentType = @@ -74,7 +74,7 @@ const Resource = { // if encodings in the header/body dont match, use the one in the body if (metaContentType && properEncoding !== encoding) { decodedContent = iconv.decode(content, properEncoding); - $ = cheerio.load(decodedContent); + $ = cheerio.load(decodedContent, { decodeEntities: false }); } return $; diff --git a/src/resource/index.test.js b/src/resource/index.test.js index 4dfaa0f87..91897d588 100644 --- a/src/resource/index.test.js +++ b/src/resource/index.test.js @@ -69,6 +69,16 @@ describe('Resource', () => { assert.equal(typeof $, 'function'); }); + it('leaves entities and special characters intact', async () => { + const url = + 'https://gist.githubusercontent.com/benubois/397678cb7deeb4d91ad61d2d9b05a0fc/raw/8c5f504c6a8d18d8ab2cc119fd3481764b16ce5f/entities.html'; + const $ = await Resource.create(url); + + const result = /😀"/g; + + assert.equal(result.test($.html()), true); + }); + it('handles special encoding', async () => { const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html'; From c6732d01774066497a5b30b92e7ad4457af528c9 Mon Sep 17 00:00:00 2001 From: Ben Ubois Date: Thu, 14 Feb 2019 20:07:37 -0800 Subject: [PATCH 2/2] Decode entities for author, dek, and title. 
--- src/cleaners/author.js | 5 ++- src/cleaners/dek.js | 7 +++-- src/cleaners/title.js | 5 ++- src/extractors/generic/content/extractor.js | 2 +- src/utils/text/decode-entities.js | 5 +++ src/utils/text/decode-entities.test.js | 35 +++++++++++++++++++++ src/utils/text/index.js | 1 + 7 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 src/utils/text/decode-entities.js create mode 100644 src/utils/text/decode-entities.test.js diff --git a/src/cleaners/author.js b/src/cleaners/author.js index 68ee0f6fc..d4e052274 100644 --- a/src/cleaners/author.js +++ b/src/cleaners/author.js @@ -1,8 +1,11 @@ -import { normalizeSpaces } from 'utils/text'; +import { decodeEntities, normalizeSpaces } from 'utils/text'; import { CLEAN_AUTHOR_RE } from './constants'; // Take an author string (like 'By David Smith ') and clean it to // just the name(s): 'David Smith'. export default function cleanAuthor(author) { + // Convert HTML encoded entities back into characters + author = decodeEntities(author); + return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim()); } diff --git a/src/cleaners/dek.js b/src/cleaners/dek.js index 78220f2f2..f2de917fd 100644 --- a/src/cleaners/dek.js +++ b/src/cleaners/dek.js @@ -1,5 +1,5 @@ import { stripTags } from 'utils/dom'; -import { excerptContent, normalizeSpaces } from 'utils/text'; +import { excerptContent, decodeEntities, normalizeSpaces } from 'utils/text'; import { TEXT_LINK_RE } from './constants'; @@ -13,11 +13,14 @@ export default function cleanDek(dek, { $, excerpt }) { if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null; - const dekText = stripTags(dek, $); + let dekText = stripTags(dek, $); // Plain text links shouldn't exist in the dek. If we have some, it's // not a good dek - bail. 
if (TEXT_LINK_RE.test(dekText)) return null; + // Convert HTML encoded entities back into characters + dekText = decodeEntities(dekText); + return normalizeSpaces(dekText.trim()); } diff --git a/src/cleaners/title.js b/src/cleaners/title.js index fe839331d..d67df0ebd 100644 --- a/src/cleaners/title.js +++ b/src/cleaners/title.js @@ -1,5 +1,5 @@ import { stripTags } from 'utils/dom'; -import { normalizeSpaces } from 'utils/text'; +import { decodeEntities, normalizeSpaces } from 'utils/text'; import { TITLE_SPLITTERS_RE } from './constants'; import { resolveSplitTitle } from './index'; @@ -21,6 +21,9 @@ export default function cleanTitle(title, { url, $ }) { } } + // Convert HTML encoded entities back into characters + title = decodeEntities(title); + // strip any html tags in the title text return normalizeSpaces(stripTags(title, $).trim()); } diff --git a/src/extractors/generic/content/extractor.js b/src/extractors/generic/content/extractor.js index 6affd50a8..517565855 100644 --- a/src/extractors/generic/content/extractor.js +++ b/src/extractors/generic/content/extractor.js @@ -80,7 +80,7 @@ const GenericContentExtractor = { return null; } - return normalizeSpaces($.html(node, { decodeEntities: false })); + return normalizeSpaces($.html(node)); }, }; diff --git a/src/utils/text/decode-entities.js b/src/utils/text/decode-entities.js new file mode 100644 index 000000000..dab3aacca --- /dev/null +++ b/src/utils/text/decode-entities.js @@ -0,0 +1,5 @@ +import cheerio from 'cheerio'; + +export default function decodeEntities(str) { + return cheerio.load(str, { decodeEntities: true }).text(); +} diff --git a/src/utils/text/decode-entities.test.js b/src/utils/text/decode-entities.test.js new file mode 100644 index 000000000..7578964e1 --- /dev/null +++ b/src/utils/text/decode-entities.test.js @@ -0,0 +1,35 @@ +import assert from 'assert'; +import decodeEntities from './decode-entities'; + +describe('decodeEntities(str)', () => { + it('decodes html entities', () => 
{ + const entityMap = { + '&#xA2;': `¢`, + '&#xA3;': `£`, + '&#xA5;': `¥`, + '&#xA9;': `©`, + '&#x26;': `&`, + '&#x3C;': `<`, + '&#x3E;': `>`, + '&#x20AC;': `€`, + '&amp;': `&`, + '&cent;': `¢`, + '&copy;': `©`, + '&euro;': `€`, + '&gt;': `>`, + '&lt;': `<`, + '&pound;': `£`, + '&reg;': `®`, + '&yen;': `¥`, + }; + const entities = Object.keys(entityMap).join(' '); + const characters = Object.values(entityMap).join(' '); + + assert.equal(decodeEntities(entities), characters); + }); + + it('Leaves non-ascii alone', () => { + const str = '德 😀 ě'; + assert.equal(decodeEntities(str), str); + }); +}); diff --git a/src/utils/text/index.js b/src/utils/text/index.js index 8ed8418ce..a416a25fa 100644 --- a/src/utils/text/index.js +++ b/src/utils/text/index.js @@ -6,3 +6,4 @@ export { default as articleBaseUrl } from './article-base-url'; export { default as hasSentenceEnd } from './has-sentence-end'; export { default as excerptContent } from './excerpt-content'; export { default as getEncoding } from './get-encoding'; +export { default as decodeEntities } from './decode-entities';