diff --git a/firehose/firehose.go b/firehose/firehose.go
index bc2c4be..91848a3 100644
--- a/firehose/firehose.go
+++ b/firehose/firehose.go
@@ -37,10 +37,18 @@ const (
 )
 
 // We use all languages so as to reliably separate Norwegian from other European languages
-var detector = lingua.NewLanguageDetectorBuilder().FromLanguages(lingua.AllLanguages()...).WithMinimumRelativeDistance(0.25).Build()
+var detector lingua.LanguageDetector
 
-// Keep track of processed event and posts count to show stats in the web interface
+// InitDetector builds the package-level language detector if it has not been
+// built yet; later calls are no-ops. Subscribe calls it before doing any work.
+// The nil check is unsynchronized, so it is not safe to call concurrently.
+func InitDetector() {
+	if detector == nil {
+		detector = lingua.NewLanguageDetectorBuilder().FromLanguages(lingua.AllLanguages()...).WithMinimumRelativeDistance(0.25).Build()
+	}
+}
 
+// Keep track of processed event and post counts to show stats in the web interface
 var (
 	processedEvents int64
 	processedPosts  int64
@@ -59,7 +67,7 @@ var feedPostPool = sync.Pool{
 }
 
-// Add this helper function at package level
-func hasEnoughNorwegianLetters(text string) bool {
+// HasEnoughNorwegianLetters is the letter-count gate DetectNorwegianLanguage applies before accepting a post; empty text never passes.
+func HasEnoughNorwegianLetters(text string) bool {
 	if len(text) == 0 {
 		return false
 	}
@@ -84,6 +92,8 @@ func hasEnoughNorwegianLetters(text string) bool {
 
-// Subscribe to the firehose using the Firehose struct as a receiver
+// Subscribe subscribes to the firehose; it initializes the language detector before doing anything else.
 func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64, detectFalseNegatives bool) {
+	InitDetector()
+
 	address := "wss://bsky.network/xrpc/com.atproto.sync.subscribeRepos"
 	headers := http.Header{}
 	headers.Set("User-Agent", "NorSky: https://github.com/snorremd/norsky")
@@ -225,8 +235,8 @@ type PostProcessor struct {
 }
 
-// Move language detection logic to its own function
-func (p *PostProcessor) detectNorwegianLanguage(text string, currentLangs []string) (bool, []string) {
-	if !hasEnoughNorwegianLetters(text) {
+// DetectNorwegianLanguage reports whether the post should be processed and the language tags to use; text failing HasEnoughNorwegianLetters is rejected with currentLangs unchanged.
+func (p *PostProcessor) DetectNorwegianLanguage(text string, currentLangs []string) (bool, []string) {
+	if !HasEnoughNorwegianLetters(text) {
 		return false, currentLangs
 	}
 
@@ -254,15 +264,23 @@ func (p *PostProcessor) detectNorwegianLanguage(text string, currentLangs []stri
 func (p *PostProcessor) processPost(evt *atproto.SyncSubscribeRepos_Commit, op *atproto.SyncSubscribeRepos_RepoOp, record *appbsky.FeedPost) error {
 	uri := fmt.Sprintf("at://%s/%s", evt.Repo, op.Path)
 
+	// Filter out posts tagged with other languages
+	if len(record.Langs) > 0 && !lo.Some(record.Langs, []string{"no", "nb", "nn", "se", "en"}) {
+		log.Debugf("Skipping post with languages: %v", record.Langs)
+		return nil
+	}
+
 	shouldProcess := false
 	langs := record.Langs
 
 	if p.detectFalseNegatives {
-		shouldProcess, langs = p.detectNorwegianLanguage(record.Text, record.Langs)
+		shouldProcess, langs = p.DetectNorwegianLanguage(record.Text, record.Langs)
 	} else if lo.Some(record.Langs, []string{"no", "nb", "nn", "se"}) {
-		shouldProcess, langs = p.detectNorwegianLanguage(record.Text, record.Langs)
+		shouldProcess, langs = p.DetectNorwegianLanguage(record.Text, record.Langs)
 	}
 
+	log.Debugf("Should process: %t, langs: %v", shouldProcess, langs)
+
 	if !shouldProcess {
 		return nil
 	}
@@ -360,3 +378,8 @@ func eventProcessor(postChan chan interface{}, context context.Context, ticker *
 		},
 	}
 }
+
+// GetDetector returns the package-level detector for testing
+func GetDetector() lingua.LanguageDetector {
+	return detector
+}
diff --git a/firehose/firehose_test.go b/firehose/firehose_test.go
new file mode 100644
index 0000000..ebcdb9c
--- /dev/null
+++ b/firehose/firehose_test.go
@@ -0,0 +1,59 @@
+package firehose_test
+
+import (
+	"norsky/firehose"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestHasEnoughNorwegianLetters(t *testing.T) {
+	tests := []struct {
+		name     string
+		text     string
+		expected bool
+	}{
+		{
+			name:     "empty string",
+			text:     "",
+			expected: false,
+		},
+		{
+			name:     "only special characters",
+			text:     "!@#$%^&*()",
+			expected: false,
+		},
+		{
+			name:     "few letters",
+			text:     "hi! :) 123456789",
+			expected: false,
+		},
+		{
+			name:     "enough regular letters",
+			text:     "Dette er en normal norsk tekst",
+			expected: true,
+		},
+		{
+			name:     "enough letters with Norwegian characters",
+			text:     "Blåbær og røde æbler på trærne",
+			expected: true,
+		},
+		{
+			name:     "mixed content with enough letters",
+			text:     "Hei! 😊 Dette er en fin dag å være ute! 🌞",
+			expected: true,
+		},
+		{
+			name:     "mixed content with too few letters",
+			text:     "Hi! 😊 🌞 123 !!! ???",
+			expected: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := firehose.HasEnoughNorwegianLetters(tt.text)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
diff --git a/go.mod b/go.mod
index c3e2970..c929bbd 100644
--- a/go.mod
+++ b/go.mod
@@ -16,13 +16,19 @@ require (
 	github.com/labstack/gommon v0.4.2
 	github.com/samber/lo v1.47.0
 	github.com/sirupsen/logrus v1.9.3
+	github.com/stretchr/testify v1.9.0
 	github.com/strideynet/bsky-furry-feed v0.0.74
 	github.com/urfave/cli/v2 v2.27.5
 	github.com/valyala/fasthttp v1.57.0
 	golang.org/x/crypto/x509roots/fallback v0.0.0-20241107225453-6018723c7405
 )
 
-require github.com/shopspring/decimal v1.3.1 // indirect
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/shopspring/decimal v1.3.1 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
 
 require (
 	github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b // indirect