forked from jdkato/prose
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument.go
143 lines (123 loc) · 3.58 KB
/
document.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package prose
import "fmt"
// A DocOpt represents a setting that changes the document creation process.
//
// Each DocOpt is a function that receives the Document being built and the
// DocOpts controlling its pipeline, and may modify either of them.
//
// For example, it might disable named-entity extraction:
//
//	doc, err := prose.NewDocument("...", prose.WithExtraction(false))
type DocOpt func(doc *Document, opts *DocOpts)
// DocOpts controls the Document creation process. NewDocument starts from a
// copy of the package defaults and lets every supplied DocOpt adjust it.
type DocOpts struct {
Extract bool // If true, include named-entity extraction
Segment bool // If true, include segmentation
Tag bool // If true, include POS tagging
Tokenizer Tokenizer // Tokenizer used to split the text; nil disables tokenization
}
// UsingTokenizer returns a DocOpt that installs include as the Tokenizer used
// during document creation, replacing whichever one was configured before.
// Installing a nil Tokenizer makes NewDocument skip the tokenization stage.
func UsingTokenizer(include Tokenizer) DocOpt {
	install := func(_ *Document, opts *DocOpts) {
		opts.Tokenizer = include
	}
	return install
}
// WithTokenization disables tokenization when include is false by clearing
// the configured Tokenizer. When include is true the option changes nothing,
// so whichever Tokenizer is already configured stays in effect.
//
// Deprecated: use UsingTokenizer instead.
func WithTokenization(include bool) DocOpt {
	return func(_ *Document, opts *DocOpts) {
		if include {
			// Nothing to do: the current Tokenizer is kept as-is.
			return
		}
		opts.Tokenizer = nil
	}
}
// WithTagging returns a DocOpt that turns POS tagging on or off by storing
// include in DocOpts.Tag. Tagging is on by default.
func WithTagging(include bool) DocOpt {
	setTag := func(_ *Document, opts *DocOpts) {
		opts.Tag = include
	}
	return setTag
}
// WithSegmentation returns a DocOpt that turns sentence segmentation on or
// off by storing include in DocOpts.Segment. Segmentation is on by default.
func WithSegmentation(include bool) DocOpt {
	setSegment := func(_ *Document, opts *DocOpts) {
		opts.Segment = include
	}
	return setSegment
}
// WithExtraction returns a DocOpt that turns named-entity extraction on or
// off by storing include in DocOpts.Extract. Extraction is on by default.
func WithExtraction(include bool) DocOpt {
	setExtract := func(_ *Document, opts *DocOpts) {
		opts.Extract = include
	}
	return setExtract
}
// UsingModel returns a DocOpt that attaches model to the Document being
// built. NewDocument only loads its default model when no model has been
// attached, so this option selects the model used for the document.
func UsingModel(model *Model) DocOpt {
	attach := func(doc *Document, _ *DocOpts) {
		doc.Model = model
	}
	return attach
}
// A Document represents a parsed body of text.
//
// The exported fields hold the model and the original text. The results of
// the pipeline stages are unexported and are read through the Tokens,
// Sentences, and Entities methods.
type Document struct {
Model *Model // model used for tagging and extraction; set by UsingModel or defaulted by NewDocument
Text string // the original input text
// TODO: Store offsets (begin, end) instead of `text` field.
entities []Entity // filled by NewDocument when extraction is enabled
sentences []Sentence // filled by NewDocument when segmentation is enabled
tokens []*Token // filled by NewDocument when a Tokenizer is configured
}
// Tokens returns `doc`'s tokens as a freshly allocated slice of values.
// Each element is a copy of the corresponding stored token, so changes made
// to the returned slice do not affect the Document. The result is never nil.
func (doc *Document) Tokens() []Token {
	copied := make([]Token, len(doc.tokens))
	for i, tok := range doc.tokens {
		copied[i] = *tok
	}
	return copied
}
// Sentences returns `doc`'s sentences. The slice is the one held by the
// Document, not a copy, so callers that modify it also modify the Document.
func (doc *Document) Sentences() []Sentence {
	stored := doc.sentences
	return stored
}
// Entities returns `doc`'s entities. The slice is the one held by the
// Document, not a copy, so callers that modify it also modify the Document.
func (doc *Document) Entities() []Entity {
	stored := doc.entities
	return stored
}
// defaultOpts is the baseline configuration that NewDocument copies before
// applying caller-supplied options: every stage is enabled and the Tokenizer
// is the one returned by NewIterTokenizer.
var defaultOpts = DocOpts{
	Extract:   true,
	Segment:   true,
	Tag:       true,
	Tokenizer: NewIterTokenizer(),
}
// NewDocument creates a Document from text according to the user-specified
// options.
//
// The options are applied in order to a copy of the package defaults. After
// that, NewDocument loads the default model if no option attached one, then
// runs the enabled stages in a fixed order: sentence segmentation,
// tokenization, POS tagging, and named-entity extraction. Tagging also runs
// when only extraction is enabled.
//
// It returns a nil Document and a wrapped error when the default model cannot
// be loaded or the sentence segmenter cannot be created.
//
// For example,
//
//	doc, err := prose.NewDocument("...")
func NewDocument(text string, opts ...DocOpt) (*Document, error) {
	doc := &Document{Text: text}

	// Options mutate a private copy, so defaultOpts' own fields are never
	// reassigned.
	settings := defaultOpts
	for _, apply := range opts {
		apply(doc, &settings)
	}

	// Only fall back to the default model when no option attached one.
	if doc.Model == nil {
		var err error
		if doc.Model, err = defaultModel(settings.Tag, settings.Extract); err != nil {
			return nil, fmt.Errorf("unable to load default model: %w", err)
		}
	}

	if settings.Segment {
		punkt, err := newPunktSentenceTokenizer()
		if err != nil {
			return nil, fmt.Errorf("unable to create punkt segmenter: %w", err)
		}
		doc.sentences = punkt.segment(text)
	}

	if tokenizer := settings.Tokenizer; tokenizer != nil {
		doc.tokens = append(doc.tokens, tokenizer.Tokenize(text)...)
	}

	// The tagger runs before extraction, even if tagging itself was disabled.
	if settings.Extract || settings.Tag {
		doc.tokens = doc.Model.tagger.Tag(doc.tokens)
	}

	if settings.Extract {
		doc.tokens = doc.Model.extracter.classify(doc.tokens)
		doc.entities = doc.Model.extracter.chunk(doc.tokens)
	}

	return doc, nil
}