-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathwriter.go
161 lines (135 loc) · 3.63 KB
/
writer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
package justext
import (
"errors"
"fmt"
"io"
"log"
"strings"
"text/template"
)
// NOTE:
// Make a new type:
// type JusText []paragraphs
// Output modes for Writer.Mode. WriteAll switches on these values to decide
// which template is rendered.
const (
	// MODE_DEFAULT makes WriteAll render through outputDefault.
	MODE_DEFAULT = 1
	// MODE_DETAILED makes WriteAll render through outputDetailed.
	MODE_DETAILED = 2
)
// Writer renders a slice of paragraphs through a text template and writes the
// result to an underlying io.Writer.
type Writer struct {
	// Mode selects the template used by WriteAll (MODE_DEFAULT or
	// MODE_DETAILED); any other value makes WriteAll return an error.
	Mode int
	// NoBoilerplate is handed to the default template as the NoBoilerplate
	// data field; how it is interpreted is up to that template.
	NoBoilerplate bool
	// Stoplist is consulted by the detailed output: a word is marked as a
	// stop word when it is present as a key. Only key presence is checked,
	// the bool value is ignored.
	Stoplist map[string]bool
	// w is the destination the executed template is written to.
	w io.Writer
}
// NewWriter builds a Writer that sends its output to w. The returned Writer
// starts in MODE_DEFAULT with NoBoilerplate enabled; Stoplist is left nil and
// can be assigned by the caller afterwards.
func NewWriter(w io.Writer) *Writer {
	writer := new(Writer)
	writer.w = w
	writer.Mode = MODE_DEFAULT
	writer.NoBoilerplate = true
	return writer
}
// WriteAll renders paragraphs to the underlying writer using the template
// selected by w.Mode.
//
// MODE_DEFAULT dispatches to outputDefault and MODE_DETAILED to
// outputDetailed; the error returned by the selected renderer (nil on
// success) is passed straight back to the caller. Any other mode value yields
// an "Unrecognised mode" error and nothing is written.
func (w *Writer) WriteAll(paragraphs []*Paragraph) error {
	// Every branch returns, so no break statements or trailing return are
	// needed (they were unreachable and flagged by go vet).
	switch w.Mode {
	case MODE_DEFAULT:
		return w.outputDefault(paragraphs)
	case MODE_DETAILED:
		return w.outputDetailed(paragraphs)
	default:
		// The message text is kept unchanged in case callers match on it.
		return errors.New("Unrecognised mode")
	}
}
// IsGood reports whether every argument is exactly the string "good".
// It returns true when called with no arguments, and false as soon as it
// meets an argument that is not a plain string equal to "good". It is
// registered as a template helper by outputDefault.
func IsGood(args ...interface{}) (result bool) {
	for i := 0; i < len(args); i++ {
		if args[i] != "good" {
			return false
		}
	}
	return true
}
// outputDefault parses the template returned by DefaultTemplate and executes
// it against the given paragraphs, writing the result to w.w.
//
// The template can call TrimSpace (strings.TrimSpace) and IsGood, and sees a
// data value with two fields: Paragraphs and NoBoilerplate (copied from the
// Writer). A parse error is returned as-is; otherwise the result of Execute
// is returned.
func (w *Writer) outputDefault(paragraphs []*Paragraph) error {
	helpers := template.FuncMap{
		"TrimSpace": strings.TrimSpace,
		"IsGood":    IsGood,
	}

	source := string(DefaultTemplate())
	tmpl, err := template.New("default").Funcs(helpers).Parse(source)
	if err != nil {
		return err
	}

	return tmpl.Execute(w.w, struct {
		Paragraphs    []*Paragraph
		NoBoilerplate bool
	}{
		Paragraphs:    paragraphs,
		NoBoilerplate: w.NoBoilerplate,
	})
}
// outputDetailed parses the template returned by DetailedTemplate and
// executes it against the given paragraphs, writing the result to w.w.
//
// The template can call TrimSpace (strings.TrimSpace) and MarkStopwords, and
// sees a data value with a single Paragraphs field. A parse error is returned
// as-is; otherwise the result of Execute is returned.
func (w *Writer) outputDetailed(paragraphs []*Paragraph) error {
	templateData := DetailedTemplate()

	// markStopwords splits its first argument (which must be a string) on
	// single spaces and wraps every word found in w.Stoplist in a
	// <span class="stopword"> element. Each word, marked or not, is followed
	// by one space, so the result always ends with a trailing space.
	//
	// NOTE(review): words are emitted verbatim; text/template performs no
	// HTML escaping, so confirm the input is already safe for HTML output.
	markStopwords := func(args ...interface{}) string {
		words := strings.Split(args[0].(string), " ")

		// Collect the pieces and join once instead of re-formatting the
		// whole accumulated output for every word.
		pieces := make([]string, 0, len(words))
		for _, word := range words {
			if _, ok := w.Stoplist[strings.TrimSpace(word)]; ok {
				word = fmt.Sprintf("<span class=\"stopword\">%s</span>", word)
			}
			pieces = append(pieces, word)
		}
		return strings.Join(pieces, " ") + " "
	}

	t := template.New("detailed")
	t.Funcs(template.FuncMap{
		"TrimSpace":     strings.TrimSpace,
		"MarkStopwords": markStopwords,
	})

	templ, err := t.Parse(string(templateData))
	if err != nil {
		return err
	}

	var data = struct {
		Paragraphs []*Paragraph
	}{paragraphs}
	return templ.Execute(w.w, data)
}
// OutputDebug logs, via the standard log package, one group of lines per
// paragraph: its DOM path followed by its classification and the counters
// and densities recorded on the paragraph. Nothing is written to w.w.
func (w *Writer) OutputDebug(paragraphs []*Paragraph) {
	for i := range paragraphs {
		p := paragraphs[i]
		log.Println(p.DomPath)

		// Label/value pairs, logged in this order, one line each.
		details := []struct {
			label string
			value interface{}
		}{
			{"\tfinal class: ", p.Class},
			{"\tcontext-free class: ", p.CfClass},
			{"\theading: ", p.Heading},
			{"\tlength (in characters): ", len(p.Text)},
			{"\tnumber of characters with links: ", p.LinkedCharCount},
			{"\tlink density: ", p.LinkDensity},
			{"\tnumber of words: ", p.WordCount},
			{"\tnumber of stop words: ", p.StopwordCount},
			{"\tstop word density: ", p.StopwordDensity},
		}
		for _, d := range details {
			log.Println(d.label, d.value)
		}
	}
}
// TO-DO:
// Need an output feature that returns a de-duplicated, space-separated text file of all
// the words in the output document, sans boilerplate. It also needs an option to exclude
// stoplist words from that output.
// TO-DO:
// Need an output feature that returns the content of a stop list (or do we just make
// the function getStoplist public? Might be a lot easier...)
/*
func (w *Writer) outputKrdwrd(paragraphs []*Paragraph) (output string) {
for _, paragraph := range paragraphs {
var cls int
if paragraph.Class == "good" || paragraph.Class == "neargood" {
if paragraph.Heading {
cls = 2
} else {
cls = 3
}
} else {
cls = 1
}
for _, textNode := range paragraph.TextNodes {
output = fmt.Sprintf("%s%i\t%s", output, cls, strings.TrimSpace(textNode))
}
}
return output
}
*/