-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopics.js
152 lines (130 loc) · 4.25 KB
/
topics.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
const nlp = require("compromise")
const keys = require('./private/keys')
const fs = require("fs")
const docs = require("./private/documents.json")
const fetchMode = false
const MongoClient = require('mongodb').MongoClient;
var ProgressBar = require('progress');
var async = require("async");
var franc = require('franc')
if (!fetchMode) {
// Running statistics about the corpus, filled in while the documents are
// processed further down in this branch. (The identifier is spelled
// "analyics" everywhere it is used in this file.)
var analyics = {
// Number of documents whose body text had more than 12000 words.
maxWords: 0,
// Number of documents whose body text had fewer than 3 words.
tooLittle: 0,
// Word counts of the over-long documents, one entry per such document.
tooMuch: [],
// {id, count, owner} record for each over-long document.
tooMuchDocs: [],
// Number of documents whose detected language was anything other than "eng".
notEnglish: 0,
// Tally of those non-English documents, keyed by the language code franc returned.
languages: {}
}
/**
 * Flattens a serialized delta (JSON text holding an `ops` array, presumably a
 * Quill-style delta) into plain text.
 *
 * Only operations whose `insert` is a string contribute to the output; any
 * other insert value (e.g. an embed object) is skipped.
 *
 * @param {string} delta JSON text expected to parse to `{ ops: [...] }`.
 * @returns {string} The concatenated string inserts, or "" when the input
 *   cannot be parsed or walked (nothing partial is returned in that case).
 */
function convertDeltaToText(delta) {
    try {
        const parsed = JSON.parse(delta)
        return parsed.ops
            .filter(op => typeof op.insert === "string")
            .map(op => op.insert)
            .join("")
    } catch {
        // Any failure (bad JSON, missing/non-array ops, null op) yields no text.
        return ""
    }
}
// ---------------------------------------------------------------------------
// Topic aggregation pass.
//
// For every document in `docs`:
//   1. convert its delta payload to plain text and detect the language;
//   2. for English documents, collect topics from the title and, when the
//      body length is within bounds, from the body as well;
//   3. normalize each topic, reduce it to singular nouns and count every noun
//      in `masterTopics`, remembering which document it came from;
//   4. for all other documents, update the language statistics in `analyics`.
// Afterwards the nouns are ranked by occurrence count and written to
// ./private/topics.json.
// ---------------------------------------------------------------------------

// Bodies with more than MAX_WORDS or fewer than MIN_WORDS words are not mined
// for topics (their title topics are still counted).
const MAX_WORDS = 12000
const MIN_WORDS = 3

// noun -> {count, docs}. Created without a prototype because the keys come
// straight from document text: with a plain `{}` a noun such as "constructor"
// or "toString" would resolve to an inherited Object.prototype member and
// corrupt the count / throw on `.docs.push`.
const masterTopics = Object.create(null)

// Counts every singular noun of one topic string against `doc`.
const recordTopic = (topicText, doc) => {
    const normalized = nlp(topicText.toLowerCase().replace(".", "")).normalize({
        plurals: true, parentheses: true, possessives: true, honorifics: true, acronyms: true, contractions: true,
        punctuation: true, case: true, whitespace: true, verbs: true, unicode: true
    })
    normalized.nouns().toSingular().json().forEach(noun => {
        const docRef = {id: doc["_id"], owner: doc.owner}
        const entry = masterTopics[noun.text]
        if (entry) {
            entry.count++
            entry.docs.push(docRef)
        } else {
            masterTopics[noun.text] = {count: 1, docs: [docRef]}
        }
    })
}

// Collects the title topics plus, for bodies of acceptable length, the body
// topics of one English document; updates the length statistics on the way.
const collectTopics = (doc, text) => {
    const topics = [...nlp(doc.title).topics().unique().json()]
    const nlpDoc = nlp(text)
    const wordCount = nlpDoc.wordCount()
    if (wordCount > MAX_WORDS) {
        analyics.maxWords++
        analyics.tooMuchDocs.push({id: doc["_id"], count: wordCount, owner: doc.owner})
        analyics.tooMuch.push(wordCount)
    } else if (wordCount < MIN_WORDS) {
        analyics.tooLittle++
    } else {
        for (const topic of nlpDoc.topics().unique().json()) {
            topics.push(topic)
        }
    }
    return topics
}

const bar = new ProgressBar(':percent :eta :bar :current/:total :elapsed ', {total: docs.length})

// All of the per-document work is synchronous, so a plain loop is used. (The
// previous async.each call never invoked the iteratee callback, which meant
// its completion handler could never run.)
for (const doc of docs) {
    const text = convertDeltaToText(doc.data)
    const language = franc(text)
    if (language === "eng") {
        collectTopics(doc, text).forEach(topic => {
            // Entries without text cannot be normalized; skip them.
            if (topic.text) {
                recordTopic(topic.text, doc)
            }
        })
    } else {
        analyics.notEnglish++
        analyics.languages[language] = (analyics.languages[language] || 0) + 1
    }
    bar.tick()
}

console.log("done?")
// Third value: percentage of documents that exceeded MAX_WORDS.
console.log(masterTopics, analyics, ((analyics.maxWords / docs.length) * 100))

// Flatten the lookup table into a list ordered by descending occurrence count.
const rankedTopics = Object.keys(masterTopics)
    .map(topic => ({topic: topic, count: masterTopics[topic].count, docs: masterTopics[topic].docs}))
    .sort((a, b) => b.count - a.count)
console.log(rankedTopics)

fs.writeFile("./private/topics.json", JSON.stringify(rankedTopics), (err) => {
    if (err) {
        // Do not claim success when the file could not be written.
        console.error("failed to write ./private/topics.json", err)
        return
    }
    console.log("written")
})
}
// Fetch mode: export every document of the "documents" collection in the
// "GraphiteWriter" database to ./private/documents.json, which is the input
// file read (via require) at the top of this script.
if (fetchMode) {
    const client = new MongoClient(keys.mongoURI, {useNewUrlParser: true})
    client.connect(async err => {
        if (err) {
            // Without a connection there is nothing to export.
            console.error("could not connect to MongoDB", err)
            return
        }
        try {
            const db = client.db("GraphiteWriter")
            const fetchedDocs = await db.collection("documents").find().toArray()
            console.log(fetchedDocs)
            await fs.promises.writeFile("./private/documents.json", JSON.stringify(fetchedDocs))
            console.log("wrote docs")
        } catch (exportErr) {
            // Covers both query and file-write failures so the rejection is not left unhandled.
            console.error("failed to export documents", exportErr)
        } finally {
            // Release the connection so the process can exit once the export is finished.
            await client.close().catch(closeErr => {
                console.error("failed to close MongoDB client", closeErr)
            })
        }
    })
}