-
Notifications
You must be signed in to change notification settings - Fork 0
/
html.go
96 lines (86 loc) · 1.63 KB
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package giawarc
import (
"bytes"
"errors"
"golang.org/x/net/html"
"io"
"strings"
)
// startNL is the set of tags that HtmlToText precedes with a newline when
// their start tag is encountered (list containers and table rows).
var startNL = map[string]bool{
	"dl": true,
	"ol": true,
	"tr": true,
	"ul": true,
}
// endNL is the set of tags whose end tag HtmlToText turns into a newline;
// every other end tag becomes a single space. The heading entries run up to
// h9 even though HTML itself only defines h1 through h6.
var endNL = map[string]bool{
	// Block containers.
	"div": true,
	"p":   true,

	// List items and table cells.
	"dd": true,
	"li": true,
	"td": true,
	"th": true,

	// Headings.
	"h1": true, "h2": true, "h3": true,
	"h4": true, "h5": true, "h6": true,
	"h7": true, "h8": true, "h9": true,
}
// selfNL is the set of childless tags that HtmlToText renders as a line
// break.
var selfNL = map[string]bool{"br": true}
// noText is the set of tags whose text content HtmlToText leaves out of its
// output. The empty-string key stands for "no start tag seen yet", so text
// ahead of the first start tag is left out as well.
var noText = map[string]bool{
	"":         true,
	"noscript": true,
	"script":   true,
	"style":    true,
}
func HtmlToText(r io.Reader) (b *bytes.Buffer, err error) {
var buf bytes.Buffer
var lastTok string
tokenizer := html.NewTokenizer(r)
for {
if tokenizer.Next() == html.ErrorToken {
err = tokenizer.Err()
if err == io.EOF {
// End of input means end of processing
return &buf, nil
}
// Raw tokenizer error
return
}
token := tokenizer.Token()
switch token.Type {
case html.DoctypeToken:
case html.CommentToken:
case html.StartTagToken:
if _, ok := startNL[token.Data]; ok {
buf.WriteString("\n")
}
lastTok = token.Data
// buf.WriteString(token.Data)
case html.EndTagToken:
if _, ok := endNL[token.Data]; ok {
buf.WriteString("\n")
} else {
buf.WriteString(" ")
}
case html.SelfClosingTagToken:
if _, ok := selfNL[token.Data]; ok {
buf.WriteString("\n")
}
case html.TextToken:
if _, ok := noText[lastTok]; !ok {
buf.WriteString(strings.ReplaceAll(token.Data, "\n", " "))
}
default:
// A token that didn't exist in the html package when we wrote this
return nil, errors.New("unknown token")
}
}
b = &buf
return
}