-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarize.jl
45 lines (34 loc) · 931 Bytes
/
summarize.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
Pkg.add("DataFrames")
using DataFrames
cd("/home/chad/git/iu/tcp/")
function readcol(fname)
Vector(readtable(fname, header=false)[:,1])
end
docs = readcol("docs")+1 # Add one to correct for 0-indexing
words = readcol("words")+1
vocab = readcol("vocab")
vocabcounts = counts(words, size(vocab)[1])
vocab[sortperm(vocabcounts,rev=true)]
samples = readtable("tcp2-gibbs.csv",header=false)
z = Matrix(samples)[end,:]+1
nTopics = maximum(z)
df = DataFrame(d=docs, w=words,z=z)
top = by(df, :z) do byz
zvocabcounts = counts(byz[:w],size(vocab)[1])
ord = sortperm(zvocabcounts,rev=true)
v = vocab[ord]
DataFrame(v1=v[1],v2=v[2],v3=v[3],v4=v[4],v5=v[5],v6=v[6],v7=v[7])
end
for k = 1:nTopics
println("TOPIC $k")
for n = 1:7
println(top[k,parse("v$n")])
end
println()
end
vocab[words][100:120]
vocabcounts[findfirst(vocab, "seconds")]
vocab[sortperm(vocabcounts,rev=true)]
top[1,:v4]
parse("v$t")
typeof(z)