-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
41 lines (38 loc) · 908 Bytes
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
'use strict'
const mdbg = require('mdbg')
/**
 * Splits `text` into dictionary entries using greedy longest-match lookups.
 *
 * At each position the longest remaining substring is looked up with
 * `mdbg.getByHanzi`; on failure the candidate is shortened by one UTF-16 code
 * unit and retried, down to a single code unit. The first lookup that resolves
 * wins and scanning continues right after the matched substring.
 *
 * Lookups that reject with `err.type === 'NotFoundError'` are treated as
 * "no such word". Any other rejection is reported through `console.error` and
 * the candidate is skipped (best effort, the function itself does not reject
 * because of it).
 *
 * @param {string} text - Text to tokenize.
 * @param {object} [opts] - Optional settings.
 * @param {boolean} [opts.spaces] - When truthy, a `' '` string is inserted
 *   between two directly adjacent entries.
 * @param {boolean} [opts.everything] - When truthy, code units that are not
 *   part of any entry are kept; consecutive ones are merged into one string.
 * @returns {Promise<Array>} Values resolved by `mdbg.getByHanzi`, interleaved
 *   with strings according to `opts`.
 */
const tokenizer = async (text, opts = {}) => {
  const list = []
  let index = 0
  while (index < text.length) {
    let wordFound = false
    // Stop at one code unit: the empty string can never be a token, and a
    // lookup for it that resolved would leave `index` unchanged forever.
    for (let count = text.length - index; count > 0; count--) {
      let entry
      try {
        entry = await mdbg.getByHanzi(text.slice(index, index + count))
      } catch (err) {
        if (err.type !== 'NotFoundError') console.error(err)
        continue
      }
      // A separator is only needed between two entries, never at the start
      // of the list or right after a kept run of unmatched text.
      const previous = list[list.length - 1]
      if (opts.spaces && list.length > 0 && typeof previous !== 'string') {
        list.push(' ')
      }
      list.push(entry)
      index += count
      wordFound = true
      break
    }
    if (wordFound) continue
    if (opts.everything) {
      // Appending to the previous string keeps runs of unmatched code units
      // (including both halves of a surrogate pair) together.
      if (typeof list[list.length - 1] === 'string') {
        list[list.length - 1] += text[index]
      } else {
        list.push(text[index])
      }
    }
    index++
  }
  return list
}
// The module's export is the tokenizer function itself.
module.exports = tokenizer
// Re-export mdbg's `init` as a property of the exported function.
module.exports.init = mdbg.init