Merge pull request #1 from zuvaai/improve-speed-and-structure

Improve overall speed and structure of Named Entity Recognition and Part of Speech tagging
zuvaai · Mar 9, 2023 · faace3a · faace3a
2 parents a7338b5 + e0de1d2
commit faace3a
Show file tree

Hide file tree

Showing 19 changed files with 491 additions and 668 deletions.
diff --git a/data.go b/data.go
diff --git a/document.go b/document.go
@@ -1,17 +1,19 @@
 package prose
 
+import "fmt"
+
 // A DocOpt represents a setting that changes the document creation process.
 //
 // For example, it might disable named-entity extraction:
 //
-//    doc := prose.NewDocument("...", prose.WithExtraction(false))
+//	doc := prose.NewDocument("...", prose.WithExtraction(false))
 type DocOpt func(doc *Document, opts *DocOpts)
 
 // DocOpts controls the Document creation process:
 type DocOpts struct {
-	Extract  bool // If true, include named-entity extraction
-	Segment  bool // If true, include segmentation
-	Tag      bool // If true, include POS tagging
+	Extract   bool      // If true, include named-entity extraction
+	Segment   bool      // If true, include segmentation
+	Tag       bool      // If true, include POS tagging
 	Tokenizer Tokenizer // If non-nil, tokenize the text with this Tokenizer
 }
 
@@ -93,16 +95,16 @@ func (doc *Document) Entities() []Entity {
 
 var defaultOpts = DocOpts{
 	Tokenizer: NewIterTokenizer(),
-	Segment:  true,
-	Tag:      true,
-	Extract:  true,
+	Segment:   true,
+	Tag:       true,
+	Extract:   true,
 }
 
 // NewDocument creates a Document according to the user-specified options.
 //
 // For example,
 //
-//    doc := prose.NewDocument("...")
+//	doc := prose.NewDocument("...")
 func NewDocument(text string, opts ...DocOpt) (*Document, error) {
 	var pipeError error
 
@@ -113,18 +115,24 @@ func NewDocument(text string, opts ...DocOpt) (*Document, error) {
 	}
 
 	if doc.Model == nil {
-		doc.Model = defaultModel(base.Tag, base.Extract)
+		doc.Model, pipeError = defaultModel(base.Tag, base.Extract)
+		if pipeError != nil {
+			return nil, fmt.Errorf("unable to load default model: %w", pipeError)
+		}
 	}
 
 	if base.Segment {
-		segmenter := newPunktSentenceTokenizer()
+		segmenter, err := newPunktSentenceTokenizer()
+		if err != nil {
+			return nil, fmt.Errorf("unable to create punkt segmenter: %w", err)
+		}
 		doc.sentences = segmenter.segment(text)
 	}
 	if base.Tokenizer != nil {
 		doc.tokens = append(doc.tokens, base.Tokenizer.Tokenize(text)...)
 	}
 	if base.Tag || base.Extract {
-		doc.tokens = doc.Model.tagger.tag(doc.tokens)
+		doc.tokens = doc.Model.tagger.Tag(doc.tokens)
 	}
 	if base.Extract {
 		doc.tokens = doc.Model.extracter.classify(doc.tokens)

diff --git a/document_test.go b/document_test.go
@@ -7,7 +7,7 @@ import (
 )
 
 func BenchmarkDoc(b *testing.B) {
-	content := readDataFile(filepath.Join(testdata, "sherlock.txt"))
+	content := readDataFile(filepath.Join(testdata, "sherlock.txt"), b)
 	text := string(content)
 	for n := 0; n < b.N; n++ {
 		_, err := NewDocument(text)
@@ -18,7 +18,7 @@ func BenchmarkDoc(b *testing.B) {
 }
 
 func BenchmarkCustomTokenizer(b *testing.B) {
-	content := readDataFile(filepath.Join(testdata, "sherlock.txt"))
+	content := readDataFile(filepath.Join(testdata, "sherlock.txt"), b)
 	tok := NewIterTokenizer(
 		UsingSanitizer(strings.NewReplacer()), // Disable sanitizer
 		UsingPrefixes([]string{"(", `"`, "[", "'"}),