Skip to content

Commit

Permalink
chore: refactored and linted
Browse files Browse the repository at this point in the history
  • Loading branch information
adampash committed Sep 13, 2016
1 parent 9906bd3 commit 7e2a349
Show file tree
Hide file tree
Showing 193 changed files with 4,177 additions and 4,315 deletions.
1 change: 1 addition & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**/fixtures/*
39 changes: 39 additions & 0 deletions .eslintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Use this file as a starting point for your project's .eslintrc.
// Copy this file, and add rule overrides as needed.
{
  "parser": "babel-eslint",
  "extends": "airbnb",
  "plugins": [
    "babel"
  ],
  "globals": {
    /* mocha test globals; false marks them read-only so tests cannot reassign them */
    "describe": false,
    "it": false
  },
  "rules": {
    "no-param-reassign": 0,
    /* TODO fix this; this should work w/import/resolver below, but doesn't */
    "import/no-extraneous-dependencies": 0,
    "import/no-unresolved": 0,
    "no-control-regex": 0,
    "import/prefer-default-export": 0,
    "generator-star-spacing": 0,
    "babel/generator-star-spacing": 0,
    "func-names": 0,
    "no-useless-escape": 0,
    "no-confusing-arrow": 0
  },
  "settings": {
    "import/resolver": {
      "babel-module": {
        "extensions": [".js"]
      }
    }
  },
  "parserOptions": {
    "ecmaFeatures": {
      "experimentalObjectRestSpread": true
    }
  }
}
13 changes: 12 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@
"main": "index.js",
"scripts": {
"start": "node ./build",
"build": "rollup -c",
"lint": "eslint src/**",
"build": "eslint src/** && rollup -c",
"test": "./test-runner"
},
"author": "",
"license": "ISC",
"devDependencies": {
"babel-eslint": "^6.1.2",
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0",
"babel-plugin-module-resolver": "^2.2.0",
"babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0",
Expand All @@ -21,6 +24,14 @@
"babel-preset-es2015-rollup": "^1.2.0",
"babel-register": "^6.11.6",
"babelrc-rollup": "^3.0.0",
"eslint": "^3.5.0",
"eslint-config-airbnb": "^11.1.0",
"eslint-import-resolver-babel-module": "^2.0.1",
"eslint-plugin-async": "^0.1.1",
"eslint-plugin-babel": "^3.3.0",
"eslint-plugin-import": "^1.15.0",
"eslint-plugin-jsx-a11y": "^2.2.2",
"eslint-plugin-react": "^6.2.1",
"mocha": "^3.0.2",
"rollup": "^0.34.13",
"rollup-plugin-babel": "^2.6.1",
Expand Down
21 changes: 21 additions & 0 deletions score-move
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env fish

# Scaffolds a scoring util module for the next-page-url extractor.
#
# Usage: score-move <file> <function>
#   <file>      base name (no extension) of the new module and its test file
#   <function>  identifier imported in the test stub and re-exported from index.js
#
# Creates <file>.js if missing, overwrites <file>.test.js with an import stub,
# and appends a re-export line to the utils index.js. All paths are relative to
# the current working directory (presumably the repository root).

# Fail early instead of generating files named ".js" / ".test.js".
if test (count $argv) -lt 2
    echo "Usage: score-move <file> <function>" >&2
    exit 1
end

set file $argv[1]
set function $argv[2]

set utils_dir src/extractors/generic/next-page-url/scoring/utils
set index_file $utils_dir/index.js
set module_file $utils_dir/$file.js
set test_file $utils_dir/$file.test.js

touch $index_file
touch $module_file
touch $test_file

# The first write truncates the test file; the rest append to it.
echo "import assert from 'assert';" > $test_file
echo "" >> $test_file
echo "import $function from './$file';" >> $test_file
echo "" >> $test_file

# index.js is only ever appended to, so existing re-exports are preserved.
echo "export { default as $function } from './$file'" >> $index_file

# Remaining manual steps for whoever runs the script.
echo "Now make it a default export"
echo "Move it to its file"
echo "Move its tests to its test file"
echo "import in score-links"
echo "Test it."

4 changes: 2 additions & 2 deletions src/cleaners/author.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { CLEAN_AUTHOR_RE } from './constants'
import { CLEAN_AUTHOR_RE } from './constants';

// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
export default function cleanAuthor(author) {
return author.replace(CLEAN_AUTHOR_RE, '$2').trim()
return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
}
20 changes: 10 additions & 10 deletions src/cleaners/author.test.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
import assert from 'assert'
import assert from 'assert';

import cleanAuthor from './author'
import cleanAuthor from './author';

describe('cleanAuthor(author)', () => {
it('removes the By from an author string', () => {
const author = cleanAuthor('By Bob Dylan')
const author = cleanAuthor('By Bob Dylan');

assert.equal(author, 'Bob Dylan')
})
assert.equal(author, 'Bob Dylan');
});

it('trims trailing whitespace and line breaks', () => {
const text = `
written by
Bob Dylan
`
const author = cleanAuthor(text)
`;
const author = cleanAuthor(text);

assert.equal(author, 'Bob Dylan')
})
})
assert.equal(author, 'Bob Dylan');
});
});
38 changes: 28 additions & 10 deletions src/cleaners/constants.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// CLEAN AUTHOR CONSTANTS
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',

// CLEAN DEK CONSTANTS
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
Expand All @@ -14,7 +14,7 @@ export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
]
];

// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
Expand All @@ -23,18 +23,36 @@ export const DEK_META_TAGS = [
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
]
];

// CLEAN DATE PUBLISHED CONSTANTS
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i
export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}(\s?[ap]\.?m\.?)?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
export const TIME_MERIDIAN_DOTS_RE = /\.m\./i;
// Three-letter month abbreviations recognised inside a date string.
const months = [
  'jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec',
];
const allMonths = months.join('|');

// These patterns are strings handed to `new RegExp`, so every regex
// backslash must be doubled. An unescaped `.` would match ANY character and
// let the am/pm suffix swallow adjacent punctuation (e.g. the comma in
// "8:30pm,"), which then breaks date parsing.
// Time of day with an optional meridian: "8:30", "8:30pm", "8:30 p.m."
const timestamp1 = '[0-9]{1,2}:[0-9]{2,2}(\\s?[ap]\\.?m\\.?)?';
// Numeric date separated by slashes or dashes: "9/13/2016", "13-09-16"
const timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';

// Matches, in priority order: a time, a numeric date, a bare 1-4 digit
// number, or a month abbreviation. Global + case-insensitive so that
// `String.prototype.match` returns every date-like token in the input.
export const SPLIT_DATE_STRING =
  new RegExp(`(${timestamp1})|(${timestamp2})|([0-9]{1,4})|(${allMonths})`, 'ig');

// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g;

// Matches a trailing top-level-domain suffix (.com, .net, .org, .co.uk).
// Written as a regex literal on purpose: inside a string passed to
// `new RegExp`, '\.' collapses to a bare '.', which matches any character
// and would treat words such as "telecom" or "internet" as domain endings.
export const DOMAIN_ENDINGS_RE = /\.com$|\.net$|\.org$|\.co\.uk$/g;
30 changes: 14 additions & 16 deletions src/cleaners/content.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,54 +8,52 @@ import {
rewriteTopLevel,
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom'

import { convertNodeTo } from 'utils/dom'
} from 'utils/dom';

// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(
article,
{
$,
cleanConditionally=true,
title='',
url='',
cleanConditionally = true,
title = '',
url = '',
}
) {
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
rewriteTopLevel(article, $)
rewriteTopLevel(article, $);

// Drop small images and spacer images
cleanImages(article, $)
cleanImages(article, $);

// Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security.
stripJunkTags(article, $)
stripJunkTags(article, $);

// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
cleanHOnes(article, $)
cleanHOnes(article, $);

// Clean headers
cleanHeaders(article, $, title)
cleanHeaders(article, $, title);

// Make links absolute
makeLinksAbsolute(article, $, url)
makeLinksAbsolute(article, $, url);

// Remove style or align attributes
cleanAttributes(article, $)
cleanAttributes(article);

// We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
cleanTags(article, $, cleanConditionally)
cleanTags(article, $, cleanConditionally);

// Remove empty paragraph nodes
removeEmpty(article, $)
removeEmpty(article, $);

return article
return article;
}
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers:
Expand Down
48 changes: 24 additions & 24 deletions src/cleaners/content.test.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import assert from 'assert';
import cheerio from 'cheerio';
import fs from 'fs';

import extractCleanNode from './content'
import extractBestNode from 'extractors/generic/content/extract-best-node'
import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractCleanNode from './content';

describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
let $ = cheerio.load(html)
it('cleans cruft out of a DOM node', () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8');
const $ = cheerio.load(html);

const opts = {
stripUnlikelyCandidates: true,
weightNodes: true,
cleanConditionally: true,
}
stripUnlikelyCandidates: true,
weightNodes: true,
cleanConditionally: true,
};

const bestNode = extractBestNode($, opts)
let result = $.html(bestNode)
// console.log(result)
// console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts })
result = $.html(cleanNode)
// console.log(result.length)
// console.log(result)
// console.log(bestNode.html())
const bestNode = extractBestNode($, opts);
// let result = $.html(bestNode);
// // console.log(result)
// // console.log(result.length)
const cleanNode = extractCleanNode(bestNode, { $, opts });
// result = $.html(cleanNode);
// // console.log(result.length)
// // console.log(result)
// // console.log(bestNode.html())

assert.equal($(bestNode).text().length, 2687)
})
})
assert.equal($(cleanNode).text().length, 2687);
});
});

32 changes: 16 additions & 16 deletions src/cleaners/date-published.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import moment from 'moment'
import moment from 'moment';
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
Expand All @@ -7,27 +7,27 @@ import {
CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING,
TIME_MERIDIAN_SPACE_RE,
TIME_MERIDIAN_DOTS_RE
} from './constants'
TIME_MERIDIAN_DOTS_RE,
} from './constants';

// Reduce a free-form date string to its date-like tokens and normalise the
// meridian so that `new Date()` has a better chance of parsing it.
export function cleanDateString(dateString) {
  // Keep only the pieces SPLIT_DATE_STRING recognises; no match yields ''.
  const tokens = dateString.match(SPLIT_DATE_STRING) || [];
  const joined = tokens.join(' ');

  // Collapse the first ".m." into "m", then put spaces around am/pm.
  const withoutDots = joined.replace(TIME_MERIDIAN_DOTS_RE, 'm');
  const spacedMeridian = withoutDots.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3');

  // Drop a leading "published:" label if one survived, then trim.
  const unlabelled = spacedMeridian.replace(CLEAN_DATE_STRING_RE, '$1');
  return unlabelled.trim();
}

// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
export default function cleanDatePublished(dateString) {
let date = moment(new Date(dateString))
let date = moment(new Date(dateString));

if (!date.isValid()) {
dateString = cleanDateString(dateString)
date = moment(new Date(dateString))
dateString = cleanDateString(dateString);
date = moment(new Date(dateString));
}

return date.isValid() ? date.toISOString() : null
}

export function cleanDateString(dateString) {
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(TIME_MERIDIAN_DOTS_RE, 'm')
.replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3')
.replace(CLEAN_DATE_STRING_RE, '$1')
.trim()
return date.isValid() ? date.toISOString() : null;
}
Loading

0 comments on commit 7e2a349

Please sign in to comment.