generated from Richienb/typescript-quickstart
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Richie Bendall <[email protected]>
- Loading branch information
Showing
8 changed files
with
101 additions
and
111 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,13 +5,13 @@ | |
"repository": "https://github.com/Richienb/fetch-charset-detection.git", | ||
"author": "Richie Bendall <[email protected]>", | ||
"license": "MIT", | ||
"main": "index.js", | ||
"type": "module", | ||
"exports": "./dist/index.js", | ||
"files": [ | ||
"index.js", | ||
"index.d.ts" | ||
"dist" | ||
], | ||
"engines": { | ||
"node": ">=10" | ||
"node": ">=12" | ||
}, | ||
"scripts": { | ||
"build": "tsc", | ||
|
@@ -29,44 +29,29 @@ | |
"node-fetch" | ||
], | ||
"dependencies": { | ||
"cheerio": "^1.0.0-rc.3", | ||
"cheerio": "^1.0.0-rc.10", | ||
"content-type": "^1.0.4", | ||
"iconv-lite": "^0.6.2" | ||
"iconv-lite": "^0.6.3" | ||
}, | ||
"devDependencies": { | ||
"@richienb/tsconfig": "^0.1.1", | ||
"@types/cheerio": "^0.22.21", | ||
"@types/content-type": "^1.1.3", | ||
"@types/nice-try": "^2.0.0", | ||
"@typescript-eslint/eslint-plugin": "^4.1.1", | ||
"@typescript-eslint/parser": "^4.1.1", | ||
"ava": "^3.12.1", | ||
"eslint-config-richienb": "^0.4.2", | ||
"eslint-config-xo-typescript": "^0.32.0", | ||
"node-fetch": "^2.6.1", | ||
"ts-node": "^9.0.0", | ||
"xo": "^0.33.1" | ||
}, | ||
"xo": { | ||
"extends": "richienb", | ||
"rules": { | ||
"node/no-missing-import": 0 | ||
}, | ||
"overrides": [ | ||
{ | ||
"files": "test.ts", | ||
"rules": { | ||
"@typescript-eslint/no-unsafe-call": 0 | ||
} | ||
} | ||
] | ||
"@richienb/tsconfig": "^0.3.0", | ||
"@types/cheerio": "^0.22.30", | ||
"@types/content-type": "^1.1.5", | ||
"ava": "^3.15.0", | ||
"node-fetch": "3.0.0-beta.10", | ||
"ts-node": "^10.1.0", | ||
"typescript": "^4.3.5", | ||
"xo": "^0.43.0" | ||
}, | ||
"ava": { | ||
"extensions": [ | ||
"ts" | ||
], | ||
"require": [ | ||
"ts-node/register" | ||
"extensions": { | ||
"ts": "module" | ||
}, | ||
"nonSemVerExperiments": { | ||
"configurableModuleFormat": true | ||
}, | ||
"nodeArguments": [ | ||
"--loader=ts-node/esm" | ||
] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,24 @@ | ||
import getCharset from "./utils/get-charset" | ||
import { decode } from "iconv-lite" | ||
import type {Buffer} from 'node:buffer'; | ||
import iconv from 'iconv-lite'; | ||
import getCharset from './utils/get-charset.js'; | ||
|
||
/** | ||
Detect the encoding of a buffer and stringify it. | ||
@param content The content to stringify. | ||
@param headers The HTTP headers provided with the content. | ||
@example | ||
``` | ||
import convertBody from 'fetch-charset-detection'; | ||
convertBody(content); | ||
``` | ||
*/ | ||
function convertBody(content: Buffer, headers?: Headers): string { | ||
export default function convertBody(content: Buffer, headers?: Headers): string { | ||
// Turn raw buffers into a single utf-8 buffer | ||
return decode( | ||
return iconv.decode( | ||
content, | ||
getCharset(content, headers) | ||
) | ||
getCharset(content, headers), | ||
); | ||
} | ||
|
||
export = convertBody |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,43 @@ | ||
import { load } from "cheerio" | ||
import parseContentType from "./parse-content-type" | ||
import type {Buffer} from 'node:buffer'; | ||
import {load} from 'cheerio'; | ||
import parseContentType from './parse-content-type.js'; | ||
|
||
/** | ||
Get the charset of `content`. | ||
@param content The content to stringify. | ||
@param headers HTTP Headers provided with the request. | ||
*/ | ||
function getCharset(content: Buffer, headers?: Headers) { | ||
export default function getCharset(content: Buffer, headers?: Headers) { | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong. |
||
// See http://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding | ||
// Resulting charset | ||
let charset: string | ||
let charset: string; | ||
|
||
// Try to extract content-type header | ||
const contentType = headers?.get("content-type") | ||
const contentType = headers?.get('content-type'); | ||
if (contentType) { | ||
charset = parseContentType(contentType) | ||
charset = parseContentType(contentType); | ||
} | ||
|
||
// No charset in content type, peek at response body for at most 1024 bytes | ||
const data = content.slice(0, 1024).toString() | ||
const data = content.slice(0, 1024).toString(); | ||
|
||
// HTML5, HTML4 and XML | ||
if (!charset && data) { | ||
const $ = load(data) | ||
const $ = load(data); | ||
|
||
charset = parseContentType( | ||
$("meta[charset]").attr("charset") || // HTML5 | ||
$("meta[http-equiv][content]").attr("content") || // HTML4 | ||
load(data.replace(/<\?(.*)\?>/im, "<$1>"), { xmlMode: true }).root().find("xml").attr("encoding") // XML | ||
) | ||
$('meta[charset]').attr('charset') // HTML5 | ||
|| $('meta[http-equiv][content]').attr('content') // HTML4 | ||
|| load(data.replace(/<\?(.*)\?>/im, '<$1>'), {xmlMode: true}).root().find('xml').attr('encoding'), // XML | ||
); | ||
|
||
// Prevent decode issues when sites use incorrect encoding | ||
// ref: https://hsivonen.fi/encoding-menu/ | ||
if (charset && ["gb2312", "gbk"].includes(charset.toLowerCase())) { | ||
charset = "gb18030" | ||
if (charset && ['gb2312', 'gbk'].includes(charset.toLowerCase())) { | ||
charset = 'gb18030'; | ||
} | ||
} | ||
|
||
return charset || "utf-8" | ||
return charset || 'utf-8'; | ||
} | ||
|
||
export = getCharset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,14 @@ | ||
import { parse } from "content-type" | ||
import {parse} from 'content-type'; | ||
|
||
/** | ||
Get the character set from a Content-Type header. | ||
@param contentType The Content-Type HTTP header. | ||
*/ | ||
function parseContentType(contentType: string) { | ||
export default function parseContentType(contentType: string) { | ||
try { | ||
return parse(contentType).parameters.charset | ||
} catch (_) { | ||
return contentType | ||
return parse(contentType).parameters.charset; | ||
} catch { | ||
return contentType; | ||
} | ||
} | ||
|
||
export = parseContentType |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,41 @@ | ||
import test from "ava" | ||
import { Headers } from "node-fetch" | ||
import { encode } from "iconv-lite" | ||
import convertBody from "./source" | ||
import test from 'ava'; | ||
import {Headers} from 'node-fetch'; | ||
import iconv from 'iconv-lite'; | ||
import convertBody from './source/index.js'; | ||
|
||
test("should support encoding decode, xml dtd detect", t => { | ||
const text = "<?xml version=\"1.0\" encoding=\"EUC-JP\"?><title>日本語</title>" | ||
t.is(convertBody(encode(text, "EUC-JP"), new Headers({ "Content-Type": "text/xml" })), text) | ||
}) | ||
test('should support encoding decode, xml dtd detect', t => { | ||
const text = '<?xml version="1.0" encoding="EUC-JP"?><title>日本語</title>'; | ||
t.is(convertBody(iconv.encode(text, 'EUC-JP'), new Headers({'Content-Type': 'text/xml'})), text); | ||
}); | ||
|
||
test("should support encoding decode, content-type detect", t => { | ||
const text = "<div>日本語</div>" | ||
t.is(convertBody(encode(text, "Shift_JIS"), new Headers({ "Content-Type": "text/html; charset=Shift-JIS" })), text) | ||
}) | ||
test('should support encoding decode, content-type detect', t => { | ||
const text = '<div>日本語</div>'; | ||
t.is(convertBody(iconv.encode(text, 'Shift_JIS'), new Headers({'Content-Type': 'text/html; charset=Shift-JIS'})), text); | ||
}); | ||
|
||
test("should support encoding decode, html5 detect", t => { | ||
const text = "<meta charset=\"gbk\"><div>中文</div>" | ||
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/html" })), text) | ||
}) | ||
test('should support encoding decode, html5 detect', t => { | ||
const text = '<meta charset="gbk"><div>中文</div>'; | ||
t.is(convertBody(iconv.encode(text, 'gbk'), new Headers({'Content-Type': 'text/html'})), text); | ||
}); | ||
|
||
test("should support encoding decode, html4 detect", t => { | ||
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=gb2312\"><div>中文</div>" | ||
t.is(convertBody(encode(text, "gb2312"), new Headers({ "Content-Type": "text/html" })), text) | ||
}) | ||
test('should support encoding decode, html4 detect', t => { | ||
const text = '<meta http-equiv="Content-Type" content="text/html; charset=gb2312"><div>中文</div>'; | ||
t.is(convertBody(iconv.encode(text, 'gb2312'), new Headers({'Content-Type': 'text/html'})), text); | ||
}); | ||
|
||
test("should support uncommon content-type order, end with qs", t => { | ||
const text = "中文" | ||
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/plain; charset=gbk; qs=1" })), text) | ||
}) | ||
test('should support uncommon content-type order, end with qs', t => { | ||
const text = '中文'; | ||
t.is(convertBody(iconv.encode(text, 'gbk'), new Headers({'Content-Type': 'text/plain; charset=gbk; qs=1'})), text); | ||
}); | ||
|
||
test("should support chunked encoding, html4 detect", t => { | ||
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=Shift_JIS\" /><div>日本語</div>" | ||
const padding = "a".repeat(10) | ||
t.is(convertBody(encode(padding + text, "Shift_JIS"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), padding + text) | ||
}) | ||
test('should support chunked encoding, html4 detect', t => { | ||
const text = '<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS" /><div>日本語</div>'; | ||
const padding = 'a'.repeat(10); | ||
t.is(convertBody(iconv.encode(padding + text, 'Shift_JIS'), new Headers({'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked'})), padding + text); | ||
}); | ||
|
||
test("should only do encoding detection up to 1024 bytes", t => { | ||
const text = "中文" | ||
const padding = "a".repeat(1200) | ||
t.not(convertBody(encode(padding + text, "gbk"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), text) | ||
}) | ||
test('should only do encoding detection up to 1024 bytes', t => { | ||
const text = '中文'; | ||
const padding = 'a'.repeat(1200); | ||
t.not(convertBody(iconv.encode(padding + text, 'gbk'), new Headers({'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked'})), text); | ||
}); |
Because node-fetch v4 might remove `body.buffer()`, it would be more convenient if this could also accept other input types, such as the result of `await res.arrayBuffer()`.
This should really be handled further downstream, in iconv-lite, but it looks like they still target very old Node.js versions.