Skip to content

Commit

Permalink
Merge pull request #9 from itsmontoya/implement-circular-buffer-for-c…
Browse files Browse the repository at this point in the history
…haracter-ngrams

Implement circular buffer for character ngrams
  • Loading branch information
itsmontoya authored Jul 16, 2024
2 parents ba52042 + 4525fc6 commit 796895a
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 48 deletions.
2 changes: 1 addition & 1 deletion bag.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func (b *Bag) toNGrams(in string) (ns []string) {
return toNGrams(in, b.c.NGramSize)
}

return tocharacterNGrams(in, b.c.NGramSize)
return toCharacterNGrams(in, b.c.NGramSize)
}

// getProbability uses a Naive Bayes classifier to determine probability for a given label
Expand Down
61 changes: 17 additions & 44 deletions characterngram.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,48 @@ package bag

import "bytes"

// tocharacterNGrams will convert inbound data to an characterNGram of provided size
func tocharacterNGrams(in string, size int) (ns []string) {
// toCharacterNGrams will convert inbound data to character n-grams of the provided size
func toCharacterNGrams(in string, size int) (ns []string) {
var c characterNGram
// Initialize characterNGram with a provided size
n := make(characterNGram, size)
c.circularBuffer = newCircularBuffer[rune](size)
// Iterate inbound data as characters
toCharacters(in, func(char rune) {
// Append character to characterNGram
n = n.Append(char)
if !n.IsFull() {
c.Shift(char)
if !c.IsFull() {
// characterNGram is not full - we do not want to append yet, return
return
}

// Append current characterNGram to characterNGrams slice
ns = append(ns, n.String())
ns = append(ns, c.String())
})

if !n.IsFull() && !n.IsZero() {
if !c.IsFull() && !c.IsZero() {
// The characterNGram is not full, so we haven't appended yet
// The characterNGram is not empty, so we have something to append
// Append current characterNGram to characterNGrams slice
ns = append(ns, n.String())
ns = append(ns, c.String())
}

return
}

// characterNGram represents a characterNGram (variable sized)
type characterNGram []rune

// Append will append a given string to an characterNGram and output the new value
// Note: The original characterNGram is NOT modified
func (n characterNGram) Append(char rune) (out characterNGram) {
// Initialize new characterNGram with the same size as the original characterNGram
out = make(characterNGram, len(n))
// Iterate through original characterNGram, starting at index 1
for i := 1; i < len(n); i++ {
// Set the value of the current original characterNGram index as the value for the previous index for the output characterNGram
out[i-1] = n[i]
}

// Set the last value of the output characterNGram as the input string
out[len(n)-1] = char
return
type characterNGram struct {
*circularBuffer[rune]
}

// String will convert the characterNGram contents to a string
func (n characterNGram) String() (out string) {
// Initialize buffer
buf := bytes.NewBuffer(nil)
// Iterate through characterNGram values
n.iterate(func(char rune) {
n.ForEach(func(char rune) (end bool) {
// Write value to buffer
buf.WriteRune(char)
return
})

// Return buffer as string
Expand All @@ -64,27 +52,12 @@ func (n characterNGram) String() (out string) {

// IsZero returns whether or not the characterNGram is empty
func (n characterNGram) IsZero() bool {
// Return result of if the value in the last position is empty
return n[len(n)-1] == 0
// Return whether the value in the first position is empty
return n.s[0] == 0
}

// IsFull returns whether or not the characterNGram is full
func (n characterNGram) IsFull() bool {
// Return result of if the value in the first position is populated
return n[0] != 0
}

// iterate will iterate through the characterNGram values
func (n characterNGram) iterate(fn func(char rune)) {
// Iterate through characterNGram values
for _, char := range n {
// Check if value is empty
if char == 0 {
// Value is empty, continue
continue
}

// Value is populated, pass to provided func
fn(char)
}
// Return whether the value in the last position is populated
return n.s[len(n.s)-1] > 0
}
6 changes: 3 additions & 3 deletions characterngram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"testing"
)

func Test_tocharacterNGrams(t *testing.T) {
func Test_toCharacterNGrams(t *testing.T) {
type args struct {
in string
size int
Expand Down Expand Up @@ -61,8 +61,8 @@ func Test_tocharacterNGrams(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if gotNs := tocharacterNGrams(tt.args.in, tt.args.size); !reflect.DeepEqual(gotNs, tt.wantNs) {
t.Errorf("tocharacterNGrams() = \n%v\n, want \n%v", gotNs, tt.wantNs)
if gotNs := toCharacterNGrams(tt.args.in, tt.args.size); !reflect.DeepEqual(gotNs, tt.wantNs) {
t.Errorf("toCharacterNGrams() = \n%v\n, want \n%v", gotNs, tt.wantNs)
}
})
}
Expand Down
36 changes: 36 additions & 0 deletions circularbuffer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,39 @@ func Test_circularBuffer_Len(t *testing.T) {
})
}
}

// Benchmark_toCharacterNGrams measures toCharacterNGrams over a fixed table
// of inputs. Every result is assigned to ngramsSink (declared elsewhere;
// presumably a package-level sink that keeps the compiler from discarding
// the call).
func Benchmark_toCharacterNGrams(b *testing.B) {
	type args struct {
		in   string
		size int
	}

	// Benchmark cases carry inputs only: the output is never verified here,
	// so no expected values are kept alongside them.
	type testcase struct {
		name string
		args args
	}

	tests := []testcase{
		{
			name: "basic",
			args: args{
				in:   "hello world! This is really cool, wowo",
				size: 3,
			},
		},
	}

	for i := 0; i < b.N; i++ {
		for _, tc := range tests {
			ngramsSink = toCharacterNGrams(tc.args.in, tc.args.size)
		}
	}
}

0 comments on commit 796895a

Please sign in to comment.