-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtldr.js
205 lines (174 loc) · 8.79 KB
/
tldr.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
/*
Produced by Brandon Skerritt
https://skerritt.tech
Instagram: @brandon.codes
Email: [email protected]
Algorithm outline (TF-IDF extractive summary):
1. Remove stop words.
2. Create a frequency table of words - how many times each word appears in the text.
3. Assign a TF score to each sentence, based on the words it contains and the frequency table.
4. Assign an IDF score to each sentence in the same way.
5. Build the summary from the highest-scoring sentences; only the top 3 are kept.
*/
// import jquery CDN
/**
 * Turns a piece of text into a list of lowercase tokens with stopwords removed.
 *
 * Full stops and commas are stripped, the text is split on single spaces,
 * each token is lowercased, and every token found in the stopword list
 * (which includes the empty string) is dropped.
 *
 * @param {string} document - Raw text to tokenise.
 * @returns {string[]} Remaining lowercase tokens, in their original order.
 */
function prettify(document){
    // Held in a Set so each membership check is a single lookup.
    const STOPWORDS = new Set(["a", "", "share", "linkthese", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves", "this"]);

    // Strip full stops and commas before splitting into tokens.
    const stripped = document.replace(/[.,]/g, '');

    const kept = [];
    for (const token of stripped.split(" ")){
        const lowered = token.toLowerCase();
        if (!STOPWORDS.has(lowered)){
            kept.push(lowered);
        }
    }
    return kept;
}
/**
 * Counts how many times each word occurs in a list of words.
 *
 * The count is taken over the `words` argument only; no outside state is read.
 *
 * @param {string[]} words - Words to count.
 * @returns {Object<string, number>} Dictionary of {WORD: COUNT}; keys are added
 *     in order of first appearance. An empty list gives an empty object.
 */
function countWords(words){
    const dict = {};
    for (const word of words){
        // Own-property check so names inherited from Object.prototype
        // (e.g. "constructor") start from zero like any other word.
        if (Object.prototype.hasOwnProperty.call(dict, word)){
            dict[word] = dict[word] + 1;
        }
        else{
            dict[word] = 1;
        }
    }
    return dict;
}
/**
 * Returns the distinct values of a word list.
 *
 * @param {string[]} words - Words, possibly containing duplicates.
 * @returns {string[]} Each distinct word once, in order of first appearance.
 */
function uniqueWords(words){
    // Returned directly: assigning to an undeclared name here would create a
    // global variable (and throws in strict mode / ES modules).
    return Array.from(new Set(words));
}
/**
 * Scores every sentence of a text by the average term frequency of its words.
 *
 * A word's term frequency is (occurrences in the whole text) / (number of
 * non-stopword words in the whole text). A sentence's score is the sum of the
 * term frequencies of its non-stopword words divided by how many such words
 * it has.
 *
 * @param {string} document - Full text; sentences are separated by ".".
 * @returns {Object<string, number>} Map of trimmed sentence text to its score.
 *     A sentence with no non-stopword words scores 0.
 */
function termFrequency(document){
    // NOTE(review): assigned without a declaration, exactly as before, so the
    // name stays visible as a global; other functions in this file appear to
    // read it. Confirm nothing depends on that before making it a local const.
    words_without_stopwords = prettify(document);

    // Split into sentences on full stops and trim surrounding whitespace.
    const sentences = document.split(".").map(item => item.trim());
    // Drops the first 146 characters of the first sentence.
    // NOTE(review): presumably page boilerplate that precedes the article
    // text -- confirm where the 146 comes from.
    sentences[0] = sentences[0].substring(146);

    // Raw counts -> term frequency: count / total non-stopword words.
    const TFVals = countWords(words_without_stopwords);
    const totalWords = words_without_stopwords.length;
    for (const key of Object.keys(TFVals)){
        TFVals[key] = TFVals[key] / totalWords;
    }

    const TFSentences = {};
    for (const sentence of sentences){
        // Tokenise the sentence the same way as the whole text, so a word
        // followed by a comma still matches its entry in TFVals.
        const sentenceWords = prettify(sentence);

        let sum = 0.0;
        for (const word of sentenceWords){
            // Own-property check: ignore names inherited from Object.prototype.
            if (Object.prototype.hasOwnProperty.call(TFVals, word)){
                sum = sum + TFVals[word];
            }
        }

        // Avoid 0 / 0 (NaN) for sentences that contain only stopwords or nothing.
        TFSentences[sentence] = sentenceWords.length === 0 ? 0 : sum / sentenceWords.length;
    }
    return TFSentences;
}
// each document is a sentence
/**
 * Scores every sentence of a text by the average inverse document frequency
 * (IDF) of its words, treating each sentence as one "document".
 *
 * For a word w: IDF(w) = log10(N / df(w)), where N is the number of sentences
 * and df(w) is the number of sentences that contain w at least once. A
 * sentence's score is the sum of the IDF values of its non-stopword words
 * divided by how many such words it has.
 *
 * @param {string} document - Full text; sentences are separated by ".".
 * @returns {Object<string, number>} Map of trimmed sentence text to its score.
 *     A sentence with no non-stopword words scores 0.
 */
function inverseDocumentFrequency(document){
    // Split into sentences on full stops and trim surrounding whitespace.
    const sentences = document.split(".").map(item => item.trim());
    // Drops the first 146 characters of the first sentence.
    // NOTE(review): presumably page boilerplate that precedes the article
    // text -- confirm where the 146 comes from.
    sentences[0] = sentences[0].substring(146);
    const lengthOfDocuments = sentences.length;

    // Tokenise each sentence once (lowercase, no stopwords, no commas).
    const sentenceWords = sentences.map(sentence => prettify(sentence));

    // Document frequency: in how many sentences does each word appear?
    // The per-sentence Set makes a repeated word count once per sentence.
    const documentFrequency = new Map();
    for (const words of sentenceWords){
        for (const word of new Set(words)){
            documentFrequency.set(word, (documentFrequency.get(word) || 0) + 1);
        }
    }

    // Every word here occurs in at least one sentence, so the count is never
    // zero and the division below is safe.
    const IDFVals = new Map();
    for (const [word, sentenceCount] of documentFrequency){
        IDFVals.set(word, Math.log10(lengthOfDocuments / sentenceCount));
    }

    const IDFSentences = {};
    sentences.forEach((sentence, index) => {
        const words = sentenceWords[index];

        let sum = 0.0;
        for (const word of words){
            sum = sum + IDFVals.get(word);
        }

        // Avoid 0 / 0 (NaN) for sentences that contain only stopwords or nothing.
        IDFSentences[sentence] = words.length === 0 ? 0 : sum / words.length;
    });
    return IDFSentences;
}
/**
 * Builds a three-bullet HTML summary from the highest TF*IDF sentences.
 *
 * Each sentence's score is its termFrequency() value multiplied by its
 * inverseDocumentFrequency() value. Sentences whose score is not a finite
 * number greater than zero are ignored.
 *
 * @param {string} documents - Full text to summarise.
 * @returns {string} HTML of the form
 *     "<br>•S1<br><br>•S2<br><br>•S3", where S1..S3 are the top sentences in
 *     descending score order. A bullet is left empty when fewer than three
 *     sentences qualify.
 */
function TFIDF(documents){
    const TFVals = termFrequency(documents);
    const IDFVals = inverseDocumentFrequency(documents);

    // The sentences are plain text taken from the page; escape them before
    // they are concatenated into markup.
    const escapeHtml = text => text
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;");

    // Collect [sentence, score] pairs for sentences present in both tables.
    const scored = [];
    for (const [sentence, tf] of Object.entries(TFVals)){
        if (!Object.prototype.hasOwnProperty.call(IDFVals, sentence)){
            continue;
        }
        const score = tf * IDFVals[sentence];
        if (Number.isFinite(score) && score > 0){
            scored.push([sentence, score]);
        }
    }

    // Highest score first. Sorting (rather than tracking three running maxima)
    // keeps the previous leader as runner-up when a new best is found, and
    // keeps sentences whose scores tie.
    scored.sort((a, b) => b[1] - a[1]);

    const [max_sentence = "", max2Sent = "", max3Sent = ""] =
        scored.slice(0, 3).map(([sentence]) => escapeHtml(sentence));

    return ("<br>" + "•" + max_sentence + "<br><br>" + "•" + max2Sent + "<br><br>" + "•" + max3Sent);
}
// get all text from .story-body within p tags on a BBC news web article
// console.log(termFrequency("Hello, my name is Brandon. Brandon Brandon. The elephant jumps over the moon"));
// get all text from .story-body within p tags on a BBC news web article
// Look up the article container (.story-body) once and reuse the selection.
const $storyBody = $('.story-body');
// Concatenated text of every <p> inside the container.
let $article = $storyBody.find('p').text();
// Put the generated summary markup at the top of the container.
let insert = $storyBody.prepend(TFIDF($article));