Skip to content

Commit

Permalink
Meta tweaks
Browse files — browse the repository at this point in the history
Signed-off-by: Richie Bendall <[email protected]>
  • Branch information
Richienb committed Sep 16, 2020
1 parent a4efa37 commit 3b0ce14
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 73 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Build directories
dist/
docs/

# Lock files
package-lock.json
Expand Down
10 changes: 0 additions & 10 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,3 @@ deploy:
api_key: $npm_token
on:
tags: true

- provider: pages
skip_cleanup: true
github_commit: "chore: Published documentation [skip ci]"
github_token: $github_token
committer_from_gh: true
keep_history: true
local_dir: docs
on:
tags: true
26 changes: 18 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,33 @@
# Fetch Charset Detection
# fetch-charset-detection [![Travis CI Build Status](https://img.shields.io/travis/com/Richienb/fetch-charset-detection/master.svg?style=for-the-badge)](https://travis-ci.com/Richienb/fetch-charset-detection)

> Charset detection and conversion, originally from `node-fetch`.
[![Build Status](https://travis-ci.com/node-fetch/fetch-charset-detection.svg?branch=master)](https://travis-ci.com/node-fetch/fetch-charset-detection)
Detect the encoding of a buffer and stringify it. Originally from [node-fetch](https://github.com/node-fetch/node-fetch)

## Install

```
```sh
npm install fetch-charset-detection
```

## Usage

```js
const convertBody = require("fetch-charset-detection");
const convertBody = require("fetch-charset-detection")

convertBody(data, headers);
convertBody(content)
```

## API

Refer to the [documentation](https://richienb.github.io/fetch-charset-detection).
### convertBody(content, headers?)

#### content

Type: `Buffer`

The content to stringify.

#### headers

Type: [`Headers`](https://developer.mozilla.org/en-US/docs/Web/API/Headers)

The HTTP headers provided with the content.
58 changes: 35 additions & 23 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,53 +1,65 @@
{
"name": "fetch-charset-detection",
"description": "Charset detection and conversion utilities, originally from node-fetch.",
"keywords": [
"content-type",
"headers",
"http",
"meta",
"node-fetch"
],
"version": "0.0.0",
"main": "dist/index.js",
"description": "Detect the encoding of a buffer and stringify it.",
"repository": "https://github.com/Richienb/fetch-charset-detection.git",
"author": "Richie Bendall <[email protected]>",
"license": "MIT",
"main": "index.js",
"files": [
"dist"
"index.js",
"index.d.ts"
],
"engines": {
"node": ">=10"
},
"repository": "https://github.com/Richienb/fetch-charset-detection.git",
"author": "Richie Bendall <[email protected]>",
"license": "MIT",
"scripts": {
"docs": "typedoc",
"build": "tsc && typedoc",
"build": "tsc",
"dev": "tsc --watch",
"lint": "xo",
"test": "xo && ava"
},
"keywords": [
"content-type",
"headers",
"http",
"meta",
"parse",
"buffer",
"node-fetch"
],
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"content-type": "^1.0.4",
"iconv-lite": "^0.6.2",
"nice-try": "^2.0.1"
"iconv-lite": "^0.6.2"
},
"devDependencies": {
"@richienb/tsconfig": "^0.1.1",
"@richienb/typedoc": "^0.1.1",
"@types/cheerio": "^0.22.21",
"@types/content-type": "^1.1.3",
"@types/nice-try": "^2.0.0",
"@typescript-eslint/eslint-plugin": "^4.1.1",
"@typescript-eslint/parser": "^4.1.1",
"ava": "^3.12.1",
"eslint-config-richienb": "^0.4.2",
"node-fetch": "^2.6.0",
"eslint-config-xo-typescript": "^0.32.0",
"node-fetch": "^2.6.1",
"ts-node": "^9.0.0",
"typedoc": "^0.19.0",
"typescript": "^4.0.2",
"xo": "^0.33.0"
"xo": "^0.33.1"
},
"xo": {
"extends": "richienb"
"extends": "richienb",
"rules": {
"node/no-missing-import": 0
},
"overrides": [
{
"files": "test.ts",
"rules": {
"@typescript-eslint/no-unsafe-call": 0
}
}
]
},
"ava": {
"extensions": [
Expand Down
12 changes: 5 additions & 7 deletions source/index.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
import getCharset from "./utils/get-charset"
import {decode} from "iconv-lite"
import { decode } from "iconv-lite"

/**
* Detect buffer encoding and convert to target encoding
* ref: http://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
*
* @param content The content to convert.
* @param headers HTTP Headers provided with a request.
Detect the encoding of a buffer and stringify it.
@param content The content to stringify.
@param headers The HTTP headers provided with the content.
*/
function convertBody(content: Buffer, headers?: Headers): string {
// Turn raw buffers into a single utf-8 buffer
return decode(
content,
getCharset(content, headers),
getCharset(content, headers)
)
}

Expand Down
9 changes: 5 additions & 4 deletions source/utils/get-charset.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import {load} from "cheerio"
import { load } from "cheerio"
import parseContentType from "./parse-content-type"

/**
Get the charset of content.
@param content The content to convert.
Get the charset of `content`.
@param content The content to stringify.
@param headers HTTP Headers provided with the request.
*/
function getCharset(content: Buffer, headers?: Headers) {
// See http://www.w3.org/TR/2011/WD-html5-20110113/parsing.html#determining-the-character-encoding
// Resulting charset
let charset: string

Expand All @@ -26,7 +27,7 @@ function getCharset(content: Buffer, headers?: Headers) {
charset = parseContentType(
$("meta[charset]").attr("charset") || // HTML5
$("meta[http-equiv][content]").attr("content") || // HTML4
load(data.replace(/<\?(.*)\?>/im, "<$1>"), {xmlMode: true}).root().find("xml").attr("encoding"), // XML
load(data.replace(/<\?(.*)\?>/im, "<$1>"), { xmlMode: true }).root().find("xml").attr("encoding") // XML
)

// Prevent decode issues when sites use incorrect encoding
Expand Down
9 changes: 6 additions & 3 deletions source/utils/parse-content-type.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import {parse} from "content-type"
import niceTry from "nice-try"
import { parse } from "content-type"

/**
Get the character set from a Content-Type header.
@param contentType The Content-Type HTTP header.
*/
function parseContentType(contentType: string) {
return niceTry(() => parse(contentType))?.parameters?.charset ?? contentType
try {
return parse(contentType).parameters.charset
} catch (_) {
return contentType
}
}

export = parseContentType
21 changes: 8 additions & 13 deletions test.ts
Original file line number Diff line number Diff line change
@@ -1,45 +1,40 @@
import test from "ava"
import { Headers } from "node-fetch"
import { encode } from "iconv-lite"
import convertBody from "./src"
import convertBody from "./source"

test("should support encoding decode, xml dtd detect", (t) => {
test("should support encoding decode, xml dtd detect", t => {
const text = "<?xml version=\"1.0\" encoding=\"EUC-JP\"?><title>日本語</title>"
t.is(convertBody(encode(text, "EUC-JP"), new Headers({ "Content-Type": "text/xml" })), text)
})

test("should support encoding decode, content-type detect", (t) => {
test("should support encoding decode, content-type detect", t => {
const text = "<div>日本語</div>"
t.is(convertBody(encode(text, "Shift_JIS"), new Headers({ "Content-Type": "text/html; charset=Shift-JIS" })), text)
})

test("should support encoding decode, html5 detect", (t) => {
test("should support encoding decode, html5 detect", t => {
const text = "<meta charset=\"gbk\"><div>中文</div>"
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/html" })), text)
})

test("should support encoding decode, html4 detect", (t) => {
test("should support encoding decode, html4 detect", t => {
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=gb2312\"><div>中文</div>"
t.is(convertBody(encode(text, "gb2312"), new Headers({ "Content-Type": "text/html" })), text)
})

test("should default to utf8 encoding", (t) => {
const text = "中文"
t.is(convertBody(text), text)
})

test("should support uncommon content-type order, end with qs", (t) => {
test("should support uncommon content-type order, end with qs", t => {
const text = "中文"
t.is(convertBody(encode(text, "gbk"), new Headers({ "Content-Type": "text/plain; charset=gbk; qs=1" })), text)
})

test("should support chunked encoding, html4 detect", (t) => {
test("should support chunked encoding, html4 detect", t => {
const text = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=Shift_JIS\" /><div>日本語</div>"
const padding = "a".repeat(10)
t.is(convertBody(encode(padding + text, "Shift_JIS"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), padding + text)
})

test("should only do encoding detection up to 1024 bytes", (t) => {
test("should only do encoding detection up to 1024 bytes", t => {
const text = "中文"
const padding = "a".repeat(1200)
t.not(convertBody(encode(padding + text, "gbk"), new Headers({ "Content-Type": "text/html", "Transfer-Encoding": "chunked" })), text)
Expand Down
4 changes: 0 additions & 4 deletions typedoc.json

This file was deleted.

0 comments on commit 3b0ce14

Please sign in to comment.