Skip to content

Commit

Permalink
Merge pull request #1 from zuvaai/improve-speed-and-structure
Browse files Browse the repository at this point in the history
Improve overall speed and structure of Named Entity Recognition and Part of Speech tagging
  • Loading branch information
aroegies authored Mar 9, 2023
2 parents a7338b5 + e0de1d2 commit faace3a
Show file tree
Hide file tree
Showing 19 changed files with 491 additions and 668 deletions.
379 changes: 0 additions & 379 deletions data.go

This file was deleted.

30 changes: 19 additions & 11 deletions document.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
package prose

import "fmt"

// A DocOpt represents a setting that changes the document creation process.
//
// For example, it might disable named-entity extraction:
//
// doc := prose.NewDocument("...", prose.WithExtraction(false))
// doc := prose.NewDocument("...", prose.WithExtraction(false))
type DocOpt func(doc *Document, opts *DocOpts)

// DocOpts controls the Document creation process:
type DocOpts struct {
Extract bool // If true, include named-entity extraction
Segment bool // If true, include segmentation
Tag bool // If true, include POS tagging
Extract bool // If true, include named-entity extraction
Segment bool // If true, include segmentation
Tag bool // If true, include POS tagging
Tokenizer Tokenizer // If non-nil, used to split the text into tokens
}

Expand Down Expand Up @@ -93,16 +95,16 @@ func (doc *Document) Entities() []Entity {

var defaultOpts = DocOpts{
Tokenizer: NewIterTokenizer(),
Segment: true,
Tag: true,
Extract: true,
Segment: true,
Tag: true,
Extract: true,
}

// NewDocument creates a Document according to the user-specified options.
//
// For example,
//
// doc := prose.NewDocument("...")
// doc := prose.NewDocument("...")
func NewDocument(text string, opts ...DocOpt) (*Document, error) {
var pipeError error

Expand All @@ -113,18 +115,24 @@ func NewDocument(text string, opts ...DocOpt) (*Document, error) {
}

if doc.Model == nil {
doc.Model = defaultModel(base.Tag, base.Extract)
doc.Model, pipeError = defaultModel(base.Tag, base.Extract)
if pipeError != nil {
return nil, fmt.Errorf("unable to load default model: %w", pipeError)
}
}

if base.Segment {
segmenter := newPunktSentenceTokenizer()
segmenter, err := newPunktSentenceTokenizer()
if err != nil {
return nil, fmt.Errorf("unable to create punkt segmenter: %w", err)
}
doc.sentences = segmenter.segment(text)
}
if base.Tokenizer != nil {
doc.tokens = append(doc.tokens, base.Tokenizer.Tokenize(text)...)
}
if base.Tag || base.Extract {
doc.tokens = doc.Model.tagger.tag(doc.tokens)
doc.tokens = doc.Model.tagger.Tag(doc.tokens)
}
if base.Extract {
doc.tokens = doc.Model.extracter.classify(doc.tokens)
Expand Down
4 changes: 2 additions & 2 deletions document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

func BenchmarkDoc(b *testing.B) {
content := readDataFile(filepath.Join(testdata, "sherlock.txt"))
content := readDataFile(filepath.Join(testdata, "sherlock.txt"), b)
text := string(content)
for n := 0; n < b.N; n++ {
_, err := NewDocument(text)
Expand All @@ -18,7 +18,7 @@ func BenchmarkDoc(b *testing.B) {
}

func BenchmarkCustomTokenizer(b *testing.B) {
content := readDataFile(filepath.Join(testdata, "sherlock.txt"))
content := readDataFile(filepath.Join(testdata, "sherlock.txt"), b)
tok := NewIterTokenizer(
UsingSanitizer(strings.NewReplacer()), // Disable sanitizer
UsingPrefixes([]string{"(", `"`, "[", "'"}),
Expand Down
Loading

0 comments on commit faace3a

Please sign in to comment.