Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Ignore entities. #273

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/cleaners/author.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import { normalizeSpaces } from 'utils/text';
import { decodeEntities, normalizeSpaces } from 'utils/text';
import { CLEAN_AUTHOR_RE } from './constants';

// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
export default function cleanAuthor(author) {
  // Convert HTML-encoded entities back into characters before matching.
  const decodedAuthor = decodeEntities(author);

  // Keep only the second capture group of CLEAN_AUTHOR_RE, then trim the
  // edges and normalize the remaining whitespace.
  const nameOnly = decodedAuthor.replace(CLEAN_AUTHOR_RE, '$2').trim();

  return normalizeSpaces(nameOnly);
}
7 changes: 5 additions & 2 deletions src/cleaners/dek.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { stripTags } from 'utils/dom';
import { excerptContent, normalizeSpaces } from 'utils/text';
import { excerptContent, decodeEntities, normalizeSpaces } from 'utils/text';

import { TEXT_LINK_RE } from './constants';

Expand All @@ -13,11 +13,14 @@ export default function cleanDek(dek, { $, excerpt }) {
if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10))
return null;

const dekText = stripTags(dek, $);
let dekText = stripTags(dek, $);

// Plain text links shouldn't exist in the dek. If we have some, it's
// not a good dek - bail.
if (TEXT_LINK_RE.test(dekText)) return null;

// Convert HTML-encoded entities back into characters
dekText = decodeEntities(dekText);

return normalizeSpaces(dekText.trim());
}
5 changes: 4 additions & 1 deletion src/cleaners/title.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { stripTags } from 'utils/dom';
import { normalizeSpaces } from 'utils/text';
import { decodeEntities, normalizeSpaces } from 'utils/text';

import { TITLE_SPLITTERS_RE } from './constants';
import { resolveSplitTitle } from './index';
Expand All @@ -21,6 +21,9 @@ export default function cleanTitle(title, { url, $ }) {
}
}

// Convert HTML-encoded entities back into characters
title = decodeEntities(title);

// strip any html tags in the title text
return normalizeSpaces(stripTags(title, $).trim());
}
4 changes: 2 additions & 2 deletions src/extractors/generic/content/extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ const GenericContentExtractor = {
extract({ $, html, title, url }, opts) {
opts = { ...this.defaultOpts, ...opts };

$ = $ || cheerio.load(html);
$ = $ || cheerio.load(html, { decodeEntities: false });

// Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
Expand All @@ -50,7 +50,7 @@ const GenericContentExtractor = {
// eslint-disable-next-line no-restricted-syntax
for (const key of Reflect.ownKeys(opts).filter(k => opts[k] === true)) {
opts[key] = false;
$ = cheerio.load(html);
$ = cheerio.load(html, { decodeEntities: false });

node = this.getContentNode($, title, url, opts);

Expand Down
6 changes: 4 additions & 2 deletions src/extractors/generic/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const GenericExtractor = {
const { html, $, contentType = 'html' } = options;

if (html && !$) {
const loaded = cheerio.load(html);
const loaded = cheerio.load(html, { decodeEntities: false });
options.$ = loaded;
}

Expand All @@ -53,7 +53,9 @@ const GenericExtractor = {
if (contentType === 'html') {
convertedContent = content;
} else if (contentType === 'text') {
convertedContent = $.text(cheerio.load(content));
convertedContent = $.text(
cheerio.load(content, { decodeEntities: false })
);
} else if (contentType === 'markdown') {
const turndownService = new TurndownService();
convertedContent = turndownService.turndown(content);
Expand Down
2 changes: 1 addition & 1 deletion src/extractors/generic/word-count/extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { normalizeSpaces } from 'utils/text';

const GenericWordCountExtractor = {
extract({ content }) {
const $ = cheerio.load(content);
const $ = cheerio.load(content, { decodeEntities: false });
const $content = $('div').first();

const text = normalizeSpaces($content.text());
Expand Down
4 changes: 2 additions & 2 deletions src/resource/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ const Resource = {
encodeDoc({ content, contentType }) {
const encoding = getEncoding(contentType);
let decodedContent = iconv.decode(content, encoding);
let $ = cheerio.load(decodedContent);
let $ = cheerio.load(decodedContent, { decodeEntities: false });

// after first cheerio.load, check to see if encoding matches
const metaContentType =
Expand All @@ -74,7 +74,7 @@ const Resource = {
// if the encodings in the header and body don't match, use the one in the body
if (metaContentType && properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
$ = cheerio.load(decodedContent, { decodeEntities: false });
}

return $;
Expand Down
10 changes: 10 additions & 0 deletions src/resource/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ describe('Resource', () => {
assert.equal(typeof $, 'function');
});

// Loads the document at `url` through Resource.create and asserts that the
// serialized HTML still contains the sequence matched by `result`.
// NOTE(review): the URL points at a remote gist, so this presumably needs
// network access (or a recorded fixture) — confirm against the test setup.
it('leaves entities and special characters intact', async () => {
const url =
'https://gist.githubusercontent.com/benubois/397678cb7deeb4d91ad61d2d9b05a0fc/raw/8c5f504c6a8d18d8ab2cc119fd3481764b16ce5f/entities.html';
const $ = await Resource.create(url);

// Sequence that must appear in the output of $.html().
const result = /😀"/g;

assert.equal(result.test($.html()), true);
});

it('handles special encoding', async () => {
const url =
'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
Expand Down
5 changes: 5 additions & 0 deletions src/utils/text/decode-entities.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import cheerio from 'cheerio';

// Decode HTML entities in `str` by loading it into cheerio with entity
// decoding switched on and reading the document's text back out.
export default function decodeEntities(str) {
  const $ = cheerio.load(str, { decodeEntities: true });

  return $.text();
}
35 changes: 35 additions & 0 deletions src/utils/text/decode-entities.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import assert from 'assert';
import decodeEntities from './decode-entities';

describe('decodeEntities(str)', () => {
  it('decodes html entities', () => {
    // Each pair is [input text, expected decoded text]; order is significant
    // because both sides are joined with spaces and compared as one string.
    const cases = [
      ['¢', `¢`],
      ['£', `£`],
      ['¥', `¥`],
      ['©', `©`],
      ['&', `&`],
      ['&#60;', `<`],
      ['&#62;', `>`],
      ['&#8364;', `€`],
      ['&amp;', `&`],
      ['&cent;', `¢`],
      ['&copy;', `©`],
      ['&euro;', `€`],
      ['&gt;', `>`],
      ['&lt;', `<`],
      ['&pound;', `£`],
      ['&reg;', `®`],
      ['&yen;', `¥`],
    ];
    const encoded = cases.map(([input]) => input).join(' ');
    const expected = cases.map(([, output]) => output).join(' ');

    assert.equal(decodeEntities(encoded), expected);
  });

  it('Leaves non-ascii alone', () => {
    // Characters outside ASCII must pass through unchanged.
    const untouched = '德 😀 ě';

    assert.equal(decodeEntities(untouched), untouched);
  });
});
1 change: 1 addition & 0 deletions src/utils/text/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ export { default as articleBaseUrl } from './article-base-url';
export { default as hasSentenceEnd } from './has-sentence-end';
export { default as excerptContent } from './excerpt-content';
export { default as getEncoding } from './get-encoding';
export { default as decodeEntities } from './decode-entities';