Skip to content

Commit

Permalink
Require Node.js 12 and move to ESM
Browse files Browse the repository at this point in the history
Signed-off-by: Richie Bendall <[email protected]>
  • Loading branch information
Richienb committed Aug 6, 2021
1 parent b8446ff commit 416ab0b
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 111 deletions.
7 changes: 3 additions & 4 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# editorconfig.org

root = true

[*]
Expand All @@ -9,5 +7,6 @@ trim_trailing_whitespace = true
insert_final_newline = true
indent_style = tab

[*.md]
trim_trailing_whitespace = false
[*.yml]
indent_style = space
indent_size = 2
2 changes: 1 addition & 1 deletion LICENSE → license
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2019 Richie Bendall
Copyright (c) 2019 - 2021 Richie Bendall

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
59 changes: 22 additions & 37 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
"repository": "https://github.com/Richienb/fetch-charset-detection.git",
"author": "Richie Bendall <[email protected]>",
"license": "MIT",
"main": "index.js",
"type": "module",
"exports": "./dist/index.js",
"files": [
"index.js",
"index.d.ts"
"dist"
],
"engines": {
"node": ">=10"
"node": ">=12"
},
"scripts": {
"build": "tsc",
Expand All @@ -29,44 +29,29 @@
"node-fetch"
],
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"cheerio": "^1.0.0-rc.10",
"content-type": "^1.0.4",
"iconv-lite": "^0.6.2"
"iconv-lite": "^0.6.3"
},
"devDependencies": {
"@richienb/tsconfig": "^0.1.1",
"@types/cheerio": "^0.22.21",
"@types/content-type": "^1.1.3",
"@types/nice-try": "^2.0.0",
"@typescript-eslint/eslint-plugin": "^4.1.1",
"@typescript-eslint/parser": "^4.1.1",
"ava": "^3.12.1",
"eslint-config-richienb": "^0.4.2",
"eslint-config-xo-typescript": "^0.32.0",
"node-fetch": "^2.6.1",
"ts-node": "^9.0.0",
"xo": "^0.33.1"
},
"xo": {
"extends": "richienb",
"rules": {
"node/no-missing-import": 0
},
"overrides": [
{
"files": "test.ts",
"rules": {
"@typescript-eslint/no-unsafe-call": 0
}
}
]
"@richienb/tsconfig": "^0.3.0",
"@types/cheerio": "^0.22.30",
"@types/content-type": "^1.1.5",
"ava": "^3.15.0",
"node-fetch": "3.0.0-beta.10",
"ts-node": "^10.1.0",
"typescript": "^4.3.5",
"xo": "^0.43.0"
},
"ava": {
"extensions": [
"ts"
],
"require": [
"ts-node/register"
"extensions": {
"ts": "module"
},
"nonSemVerExperiments": {
"configurableModuleFormat": true
},
"nodeArguments": [
"--loader=ts-node/esm"
]
}
}
6 changes: 3 additions & 3 deletions README.md → readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ npm install fetch-charset-detection
## Usage

```js
const convertBody = require("fetch-charset-detection")
import convertBody from 'fetch-charset-detection';

convertBody(content)
convertBody(content);
```

## API
Expand All @@ -22,7 +22,7 @@ convertBody(content)

#### content

Type: `Buffer`
Type: [`Buffer`](https://nodejs.org/api/buffer.html#buffer_class_buffer)

The content to stringify.

Expand Down
23 changes: 15 additions & 8 deletions source/index.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import getCharset from "./utils/get-charset"
import { decode } from "iconv-lite"
import type {Buffer} from 'node:buffer';
import iconv from 'iconv-lite';
import getCharset from './utils/get-charset.js';

/**
Detect the encoding of a buffer and stringify it.
@param content The content to stringify.
@param headers The HTTP headers provided with the content.
@example
```
import convertBody from 'fetch-charset-detection';
convertBody(content);
```
*/
function convertBody(content: Buffer, headers?: Headers): string {
export default function convertBody(content: Buffer, headers?: Headers): string {
// Turn raw buffers into a single utf-8 buffer
return decode(
return iconv.decode(
content,
getCharset(content, headers)
)
getCharset(content, headers),
);
}

export = convertBody
34 changes: 17 additions & 17 deletions source/utils/get-charset.ts
Original file line number Diff line number Diff line change
@@ -1,43 +1,43 @@
import { load } from "cheerio"
import parseContentType from "./parse-content-type"
import type {Buffer} from 'node:buffer';
import {load} from 'cheerio';
import parseContentType from './parse-content-type.js';

/**
Get the charset of `content`.
@param content The content to stringify.
@param headers HTTP Headers provided with the request.
*/
function getCharset(content: Buffer, headers?: Headers) {
export default function getCharset(content: Buffer, headers?: Headers) {

This comment has been minimized.

Copy link
@jimmywarting

jimmywarting Aug 6, 2021

Because node-fetch v4 might remove body.buffer(), it would be more convenient if this could accept other inputs as well, such as the result of await res.arrayBuffer()

This should actually be handled further downstream in iconv-lite, but it looks like they still target very old Node.js versions

This comment has been minimized.

Copy link
@Richienb

Richienb Aug 10, 2021

Author Member

Done in 0.7.0

// See http://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
// Resulting charset
let charset: string
let charset: string;

// Try to extract content-type header
const contentType = headers?.get("content-type")
const contentType = headers?.get('content-type');
if (contentType) {
charset = parseContentType(contentType)
charset = parseContentType(contentType);
}

// No charset in content type, peek at response body for at most 1024 bytes
const data = content.slice(0, 1024).toString()
const data = content.slice(0, 1024).toString();

// HTML5, HTML4 and XML
if (!charset && data) {
const $ = load(data)
const $ = load(data);

charset = parseContentType(
$("meta[charset]").attr("charset") || // HTML5
$("meta[http-equiv][content]").attr("content") || // HTML4
load(data.replace(/<\?(.*)\?>/im, "<$1>"), { xmlMode: true }).root().find("xml").attr("encoding") // XML
)
$('meta[charset]').attr('charset') // HTML5
|| $('meta[http-equiv][content]').attr('content') // HTML4
|| load(data.replace(/<\?(.*)\?>/im, '<$1>'), {xmlMode: true}).root().find('xml').attr('encoding'), // XML
);

// Prevent decode issues when sites use incorrect encoding
// ref: https://hsivonen.fi/encoding-menu/
if (charset && ["gb2312", "gbk"].includes(charset.toLowerCase())) {
charset = "gb18030"
if (charset && ['gb2312', 'gbk'].includes(charset.toLowerCase())) {
charset = 'gb18030';
}
}

return charset || "utf-8"
return charset || 'utf-8';
}

export = getCharset
13 changes: 6 additions & 7 deletions source/utils/parse-content-type.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import { parse } from "content-type"
import {parse} from 'content-type';

/**
Get the character set from a Content-Type header.
@param contentType The Content-Type HTTP header.
*/
function parseContentType(contentType: string) {
export default function parseContentType(contentType: string) {
try {
return parse(contentType).parameters.charset
} catch (_) {
return contentType
return parse(contentType).parameters.charset;
} catch {
return contentType;
}
}

export = parseContentType
68 changes: 34 additions & 34 deletions test.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
import test from "ava"
import { Headers } from "node-fetch"
import { encode } from "iconv-lite"
import convertBody from "./source"
import test from 'ava';
import {Headers} from 'node-fetch';
import iconv from 'iconv-lite';
import convertBody from './source/index.js';

test("should support encoding decode, xml dtd detect", t => {
const text = "<?xml version=\"1.0\" encoding=\"EUC-JP\"?><title>日本語</title>"
t.is(convertBody(encode(text, "EUC-JP"), new Headers({ "Content-Type": "text/xml" })), text)
})
test('should support encoding decode, xml dtd detect', t => {
const text = '<?xml version="1.0" encoding="EUC-JP"?><title>日本語</title>';
t.is(convertBody(iconv.encode(text, 'EUC-JP'), new Headers({'Content-Type': 'text/xml'})), text);
});

test("should support encoding decode, content-type detect", t => {
const text = "<div>日本語</div>"
t.is(convertBody(encode(text, "Shift_JIS"), new Headers({ "Content-Type": "text/html; charset=Shift-JIS" })), text)
})
test('should support encoding decode, content-type detect', t => {
const text = '<div>日本語</div>';
t.is(convertBody(iconv.encode(text, 'Shift_JIS'), new Headers({'Content-Type': 'text/html; charset=Shift-JIS'})), text);
});

test("should support encoding decode, html5 detect", t => {
const text = "<meta charset=\"gbk\"><div>中文</div>"
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/html" })), text)
})
test('should support encoding decode, html5 detect', t => {
const text = '<meta charset="gbk"><div>中文</div>';
t.is(convertBody(iconv.encode(text, 'gbk'), new Headers({'Content-Type': 'text/html'})), text);
});

test("should support encoding decode, html4 detect", t => {
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=gb2312\"><div>中文</div>"
t.is(convertBody(encode(text, "gb2312"), new Headers({ "Content-Type": "text/html" })), text)
})
test('should support encoding decode, html4 detect', t => {
const text = '<meta http-equiv="Content-Type" content="text/html; charset=gb2312"><div>中文</div>';
t.is(convertBody(iconv.encode(text, 'gb2312'), new Headers({'Content-Type': 'text/html'})), text);
});

test("should support uncommon content-type order, end with qs", t => {
const text = "中文"
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/plain; charset=gbk; qs=1" })), text)
})
test('should support uncommon content-type order, end with qs', t => {
const text = '中文';
t.is(convertBody(iconv.encode(text, 'gbk'), new Headers({'Content-Type': 'text/plain; charset=gbk; qs=1'})), text);
});

test("should support chunked encoding, html4 detect", t => {
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=Shift_JIS\" /><div>日本語</div>"
const padding = "a".repeat(10)
t.is(convertBody(encode(padding + text, "Shift_JIS"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), padding + text)
})
test('should support chunked encoding, html4 detect', t => {
const text = '<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS" /><div>日本語</div>';
const padding = 'a'.repeat(10);
t.is(convertBody(iconv.encode(padding + text, 'Shift_JIS'), new Headers({'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked'})), padding + text);
});

test("should only do encoding detection up to 1024 bytes", t => {
const text = "中文"
const padding = "a".repeat(1200)
t.not(convertBody(encode(padding + text, "gbk"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), text)
})
test('should only do encoding detection up to 1024 bytes', t => {
const text = '中文';
const padding = 'a'.repeat(1200);
t.not(convertBody(iconv.encode(padding + text, 'gbk'), new Headers({'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked'})), text);
});

0 comments on commit 416ab0b

Please sign in to comment.