-
Notifications
You must be signed in to change notification settings - Fork 1
/
jsonlreader.go
118 lines (106 loc) · 2.47 KB
/
jsonlreader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package giashard
import (
"encoding/base64"
"encoding/json"
"io"
"log"
"os"
"github.com/klauspost/compress/zstd"
)
// JsonlRecord is one decoded line of the JSONL input. The field set used to
// come from a schema and is now fixed in code: only the "u" and "text" keys of
// each JSON object are captured.
type JsonlRecord struct {
	Url  string `json:"u"`    // value of the "u" key
	Text string `json:"text"` // value of the "text" key
}
// JsonlReader reads a zstandard-compressed JSONL file and feeds its records to
// a channel (modelled on giashard's LineReader).
//
// f is the underlying input (an *os.File, possibly os.Stdin), z is the stream
// the JSON decoder actually reads from (the zstd decoder wrapped as an
// io.ReadCloser, or f itself for stdin), and fatal says whether read errors
// should abort the program.
type JsonlReader struct {
	f, z  io.ReadCloser
	fatal bool
}
// NewJsonlReader opens filename for record reading.
//
// If filename is "-", os.Stdin is used directly as the record stream; no zstd
// decoder is put in front of it. Otherwise the file is opened and wrapped in a
// zstd decoder. The returned reader has fatal error handling switched on; use
// Fatal to change that.
//
// On error the returned reader is nil and no file is left open.
func NewJsonlReader(filename string) (r *JsonlReader, err error) {
	// deal with reading from stdin
	if filename == "-" {
		log.Println("Reading from stdin")
		// horrible hack: stdin doubles as both the "file" and the decoded
		// stream, so Close must take care not to close it twice.
		return &JsonlReader{f: os.Stdin, z: os.Stdin, fatal: true}, nil
	}

	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}

	d, err := zstd.NewReader(f)
	if err != nil {
		// Don't leak the descriptor when the decoder cannot be set up. The
		// decoder error is the one worth reporting, so a failure to close is
		// deliberately ignored here.
		_ = f.Close()
		return nil, err
	}

	// IOReadCloser gives us an io.ReadCloser, to match LineReader.
	return &JsonlReader{f: f, z: d.IOReadCloser(), fatal: true}, nil
}
// Fatal sets whether read errors are fatal, i.e. abort the program with
// log.Fatalf (true), or are merely logged (false).
func (r *JsonlReader) Fatal(flag bool) {
	r.fatal = flag
}
// Close shuts down the record stream and then the underlying file. If both
// closes fail, the error from the file close is the one returned.
func (r *JsonlReader) Close() (err error) {
	err = r.z.Close()

	// When reading from stdin the stream and the file are the same object, so
	// it has already been closed above.
	if r.f == os.Stdin {
		return err
	}

	if fileErr := r.f.Close(); fileErr != nil {
		err = fileErr
	}
	return err
}
// Records starts a goroutine that decodes JSON values from the reader's stream
// and sends every record with non-empty Text to the returned channel (replaces
// Lines()). The channel is closed when the stream is exhausted or decoding can
// no longer make progress.
//
// Error handling depends on the fatal flag:
//   - fatal: the first decode error aborts the program via log.Fatalf.
//   - non-fatal: the error is logged and the offending record is dropped. A
//     type mismatch inside an otherwise well-formed JSON value is skipped and
//     decoding continues with the next value. Any other error (malformed JSON,
//     truncated input, read failure) ends the stream, because json.Decoder
//     keeps returning that same error on every later call while More can still
//     report pending data - looping on it would never terminate.
func (r *JsonlReader) Records() (ch chan JsonlRecord) {
	ch = make(chan JsonlRecord)
	decoder := json.NewDecoder(r.z)
	go func() {
		defer close(ch)
		for decoder.More() {
			var record JsonlRecord // alt: decode to map[string][]byte to include all records
			if err := decoder.Decode(&record); err != nil {
				if r.fatal {
					log.Fatalf("Error decoding record: %v", err)
				}
				log.Printf("Error decoding record: %v", err)
				// Only a type error leaves the decoder positioned after a
				// complete value; everything else is unrecoverable.
				if _, recoverable := err.(*json.UnmarshalTypeError); recoverable {
					continue
				}
				return
			}
			if len(record.Text) > 0 {
				ch <- record
			}
		}
	}()
	return ch
}
// Rows adapts Records to the row format used downstream: each record becomes a
// map with two columns, "url" (the raw URL bytes) and "text" (the record text,
// base64-encoded with the standard alphabet to match the Paracrawl format).
// The returned channel is closed once Records has been drained.
func (r *JsonlReader) Rows() (ch chan map[string][]byte) {
	ch = make(chan map[string][]byte)
	records := r.Records()
	go func() {
		defer close(ch)
		for rec := range records {
			// we base64 encode to match Paracrawl format
			raw := []byte(rec.Text)
			encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
			base64.StdEncoding.Encode(encoded, raw)

			ch <- map[string][]byte{
				"url":  []byte(rec.Url),
				"text": encoded,
			}
		}
	}()
	return ch
}