Skip to content

Commit

Permalink
fix: Filter out posts whose language tags are non-empty and include none of en, no, nb, nn, se
Browse files Browse the repository at this point in the history
We don't want posts tagged as other languages
  • Loading branch information
snorremd committed Dec 29, 2024
1 parent 37f85b3 commit 4da7688
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 8 deletions.
34 changes: 27 additions & 7 deletions firehose/firehose.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,15 @@ const (
)

// We use all languages so as to reliably separate Norwegian from other European languages
var detector = lingua.NewLanguageDetectorBuilder().FromLanguages(lingua.AllLanguages()...).WithMinimumRelativeDistance(0.25).Build()
var detector lingua.LanguageDetector

// InitDetector lazily builds the package-level language detector.
//
// The detector is constructed from every language lingua supports, with a
// minimum relative distance of 0.25, and stored in the package-level detector
// variable. If the detector has already been set, the call does nothing.
func InitDetector() {
	if detector != nil {
		return
	}

	builder := lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.AllLanguages()...).
		WithMinimumRelativeDistance(0.25)
	detector = builder.Build()
}

// Keep track of processed event and posts count to show stats in the web interface
var (
processedEvents int64
processedPosts int64
Expand All @@ -59,7 +64,7 @@ var feedPostPool = sync.Pool{
}

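// HasEnoughNorwegianLetters reports whether text contains enough letters to be
// worth running language detection on. Empty text always yields false.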
func hasEnoughNorwegianLetters(text string) bool {
func HasEnoughNorwegianLetters(text string) bool {
if len(text) == 0 {
return false
}
Expand All @@ -84,6 +89,8 @@ func hasEnoughNorwegianLetters(text string) bool {
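// Subscribe subscribes to the Bluesky firehose (com.atproto.sync.subscribeRepos
// on bsky.network). It is a package-level function and initializes the language
// detector before doing anything else.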
func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64, detectFalseNegatives bool) {

InitDetector()

address := "wss://bsky.network/xrpc/com.atproto.sync.subscribeRepos"
headers := http.Header{}
headers.Set("User-Agent", "NorSky: https://github.com/snorremd/norsky")
Expand Down Expand Up @@ -225,8 +232,8 @@ type PostProcessor struct {
}

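// DetectNorwegianLanguage holds the language detection logic for a post's text.
// Text without enough letters is rejected up front: the method then returns
// false together with currentLangs unchanged.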
func (p *PostProcessor) detectNorwegianLanguage(text string, currentLangs []string) (bool, []string) {
if !hasEnoughNorwegianLetters(text) {
func (p *PostProcessor) DetectNorwegianLanguage(text string, currentLangs []string) (bool, []string) {
if !HasEnoughNorwegianLetters(text) {
return false, currentLangs
}

Expand Down Expand Up @@ -254,15 +261,23 @@ func (p *PostProcessor) detectNorwegianLanguage(text string, currentLangs []stri
func (p *PostProcessor) processPost(evt *atproto.SyncSubscribeRepos_Commit, op *atproto.SyncSubscribeRepos_RepoOp, record *appbsky.FeedPost) error {
uri := fmt.Sprintf("at://%s/%s", evt.Repo, op.Path)

// Filter out posts tagged with other languages
if len(record.Langs) > 0 && !lo.Some(record.Langs, []string{"no", "nb", "nn", "se", "en"}) {
log.Debugf("Skipping post with languages: %v", record.Langs)
return nil
}

shouldProcess := false
langs := record.Langs

if p.detectFalseNegatives {
shouldProcess, langs = p.detectNorwegianLanguage(record.Text, record.Langs)
shouldProcess, langs = p.DetectNorwegianLanguage(record.Text, record.Langs)
} else if lo.Some(record.Langs, []string{"no", "nb", "nn", "se"}) {
shouldProcess, langs = p.detectNorwegianLanguage(record.Text, record.Langs)
shouldProcess, langs = p.DetectNorwegianLanguage(record.Text, record.Langs)
}

log.Infof("Should process: %t, langs: %v", shouldProcess, langs)

if !shouldProcess {
return nil
}
Expand Down Expand Up @@ -360,3 +375,8 @@ func eventProcessor(postChan chan interface{}, context context.Context, ticker *
},
}
}

// GetDetector returns the package-level language detector so that tests can
// access it. The package-level variable is declared without an initial value,
// so the result is nil if the detector has not been initialized yet (see
// InitDetector).
func GetDetector() lingua.LanguageDetector {
return detector
}
59 changes: 59 additions & 0 deletions firehose/firehose_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package firehose_test

import (
"norsky/firehose"
"testing"

"github.com/stretchr/testify/assert"
)

// TestHasEnoughNorwegianLetters runs firehose.HasEnoughNorwegianLetters over a
// table of inputs (empty text, symbol-only text, emoji-heavy text and ordinary
// Norwegian sentences) and checks the boolean result for each one.
func TestHasEnoughNorwegianLetters(t *testing.T) {
	type testCase struct {
		name     string
		text     string
		expected bool
	}

	cases := []testCase{
		{name: "empty string", text: "", expected: false},
		{name: "only special characters", text: "!@#$%^&*()", expected: false},
		{name: "few letters", text: "hi! :) 123456789", expected: false},
		{name: "enough regular letters", text: "Dette er en normal norsk tekst", expected: true},
		{name: "enough letters with Norwegian characters", text: "Blåbær og røde æbler på trærne", expected: true},
		{name: "mixed content with enough letters", text: "Hei! 😊 Dette er en fin dag å være ute! 🌞", expected: true},
		{name: "mixed content with too few letters", text: "Hi! 😊 🌞 123 !!! ???", expected: false},
	}

	// Each case runs as its own named subtest so a failure identifies the input.
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := firehose.HasEnoughNorwegianLetters(tc.text)
			assert.Equal(t, tc.expected, got)
		})
	}
}
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@ require (
github.com/labstack/gommon v0.4.2
github.com/samber/lo v1.47.0
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.9.0
github.com/strideynet/bsky-furry-feed v0.0.74
github.com/urfave/cli/v2 v2.27.5
github.com/valyala/fasthttp v1.57.0
golang.org/x/crypto/x509roots/fallback v0.0.0-20241107225453-6018723c7405
)

require github.com/shopspring/decimal v1.3.1 // indirect
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

require (
github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b // indirect
Expand Down

0 comments on commit 4da7688

Please sign in to comment.