postlight · benubois · Feb 13, 2019 · Feb 15, 2019
diff --git a/src/cleaners/author.js b/src/cleaners/author.js
@@ -1,8 +1,11 @@
-import { normalizeSpaces } from 'utils/text';
+import { decodeEntities, normalizeSpaces } from 'utils/text';
 import { CLEAN_AUTHOR_RE } from './constants';
 
 // Take an author string (like 'By David Smith ') and clean it to
 // just the name(s): 'David Smith'.
 export default function cleanAuthor(author) {
+  // Convert HTML-encoded entities back into characters
+  author = decodeEntities(author);
+
   return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
 }
diff --git a/src/cleaners/dek.js b/src/cleaners/dek.js
@@ -1,5 +1,5 @@
 import { stripTags } from 'utils/dom';
-import { excerptContent, normalizeSpaces } from 'utils/text';
+import { excerptContent, decodeEntities, normalizeSpaces } from 'utils/text';
 
 import { TEXT_LINK_RE } from './constants';
 
@@ -13,11 +13,14 @@ export default function cleanDek(dek, { $, excerpt }) {
   if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10))
     return null;
 
-  const dekText = stripTags(dek, $);
+  let dekText = stripTags(dek, $);
 
   // Plain text links shouldn't exist in the dek. If we have some, it's
   // not a good dek - bail.
   if (TEXT_LINK_RE.test(dekText)) return null;
 
+  // Convert HTML-encoded entities back into characters
+  dekText = decodeEntities(dekText);
+
   return normalizeSpaces(dekText.trim());
 }
diff --git a/src/cleaners/title.js b/src/cleaners/title.js
@@ -1,5 +1,5 @@
 import { stripTags } from 'utils/dom';
-import { normalizeSpaces } from 'utils/text';
+import { decodeEntities, normalizeSpaces } from 'utils/text';
 
 import { TITLE_SPLITTERS_RE } from './constants';
 import { resolveSplitTitle } from './index';
@@ -21,6 +21,9 @@ export default function cleanTitle(title, { url, $ }) {
     }
   }
 
+  // Convert HTML-encoded entities back into characters
+  title = decodeEntities(title);
+
   // strip any html tags in the title text
   return normalizeSpaces(stripTags(title, $).trim());
 }
diff --git a/src/extractors/generic/content/extractor.js b/src/extractors/generic/content/extractor.js
@@ -35,7 +35,7 @@ const GenericContentExtractor = {
   extract({ $, html, title, url }, opts) {
     opts = { ...this.defaultOpts, ...opts };
 
-    $ = $ || cheerio.load(html);
+    $ = $ || cheerio.load(html, { decodeEntities: false });
 
     // Cascade through our extraction-specific opts in an ordered fashion,
     // turning them off as we try to extract content.
@@ -50,7 +50,7 @@ const GenericContentExtractor = {
     // eslint-disable-next-line no-restricted-syntax
     for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {
       opts[key] = false;
-      $ = cheerio.load(html);
+      $ = cheerio.load(html, { decodeEntities: false });
 
       node = this.getContentNode($, title, url, opts);
 

diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js
@@ -32,7 +32,7 @@ const GenericExtractor = {
     const { html, $, contentType = 'html' } = options;
 
     if (html && !$) {
-      const loaded = cheerio.load(html);
+      const loaded = cheerio.load(html, { decodeEntities: false });
       options.$ = loaded;
     }
 
@@ -53,7 +53,9 @@ const GenericExtractor = {
     if (contentType === 'html') {
       convertedContent = content;
     } else if (contentType === 'text') {
-      convertedContent = $.text(cheerio.load(content));
+      convertedContent = $.text(
+        cheerio.load(content, { decodeEntities: false })
+      );
     } else if (contentType === 'markdown') {
       const turndownService = new TurndownService();
       convertedContent = turndownService.turndown(content);

diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js
@@ -4,7 +4,7 @@ import { normalizeSpaces } from 'utils/text';
 
 const GenericWordCountExtractor = {
   extract({ content }) {
-    const $ = cheerio.load(content);
+    const $ = cheerio.load(content, { decodeEntities: false });
     const $content = $('div').first();
 
     const text = normalizeSpaces($content.text());

diff --git a/src/resource/index.js b/src/resource/index.js
@@ -63,7 +63,7 @@ const Resource = {
   encodeDoc({ content, contentType }) {
     const encoding = getEncoding(contentType);
     let decodedContent = iconv.decode(content, encoding);
-    let $ = cheerio.load(decodedContent);
+    let $ = cheerio.load(decodedContent, { decodeEntities: false });
 
     // after first cheerio.load, check to see if encoding matches
     const metaContentType =
@@ -74,7 +74,7 @@ const Resource = {
     // if encodings in the header/body dont match, use the one in the body
     if (metaContentType && properEncoding !== encoding) {
       decodedContent = iconv.decode(content, properEncoding);
-      $ = cheerio.load(decodedContent);
+      $ = cheerio.load(decodedContent, { decodeEntities: false });
     }
 
     return $;

diff --git a/src/resource/index.test.js b/src/resource/index.test.js
@@ -69,6 +69,16 @@ describe('Resource', () => {
       assert.equal(typeof $, 'function');
     });
 
+    it('leaves entities and special characters intact', async () => {
+      const url =
+        'https://gist.githubusercontent.com/benubois/397678cb7deeb4d91ad61d2d9b05a0fc/raw/8c5f504c6a8d18d8ab2cc119fd3481764b16ce5f/entities.html';
+      const $ = await Resource.create(url);
+
+      const result = /😀&quot;/g;
+
+      assert.equal(result.test($.html()), true);
+    });
+
     it('handles special encoding', async () => {
       const url =
         'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';

diff --git a/src/utils/text/decode-entities.js b/src/utils/text/decode-entities.js
@@ -0,0 +1,5 @@
+import cheerio from 'cheerio';
+
+export default function decodeEntities(str) {
+  return cheerio.load(str, { decodeEntities: true }).text();
+}
diff --git a/src/utils/text/decode-entities.test.js b/src/utils/text/decode-entities.test.js
@@ -0,0 +1,35 @@
+import assert from 'assert';
+import decodeEntities from './decode-entities';
+
+describe('decodeEntities(str)', () => {
+  it('decodes html entities', () => {
+    const entityMap = {
+      '&#162;': `¢`,
+      '&#163;': `£`,
+      '&#165;': `¥`,
+      '&#169;': `©`,
+      '&#38;': `&`,
+      '&#60;': `<`,
+      '&#62;': `>`,
+      '&#8364;': `€`,
+      '&amp;': `&`,
+      '&cent;': `¢`,
+      '&copy;': `©`,
+      '&euro;': `€`,
+      '&gt;': `>`,
+      '&lt;': `<`,
+      '&pound;': `£`,
+      '&reg;': `®`,
+      '&yen;': `¥`,
+    };
+    const entities = Object.keys(entityMap).join(' ');
+    const characters = Object.values(entityMap).join(' ');
+
+    assert.equal(decodeEntities(entities), characters);
+  });
+
+  it('Leaves non-ascii alone', () => {
+    const str = '德 😀 ě';
+    assert.equal(decodeEntities(str), str);
+  });
+});
diff --git a/src/utils/text/index.js b/src/utils/text/index.js
@@ -6,3 +6,4 @@ export { default as articleBaseUrl } from './article-base-url';
 export { default as hasSentenceEnd } from './has-sentence-end';
 export { default as excerptContent } from './excerpt-content';
 export { default as getEncoding } from './get-encoding';
+export { default as decodeEntities } from './decode-entities';