diff --git a/firehose/firehose.go b/firehose/firehose.go
index 6ff43f8..51fbf69 100644
--- a/firehose/firehose.go
+++ b/firehose/firehose.go
@@ -19,12 +19,32 @@ import (
 	"github.com/bluesky-social/indigo/repomgr"
 	"github.com/cenkalti/backoff/v4"
 	"github.com/gorilla/websocket"
+	lingua "github.com/pemistahl/lingua-go"
 	"github.com/samber/lo"
 	log "github.com/sirupsen/logrus"
 )
 
+// Static list of languages to use for lingua-go language detection.
+//
+// lingua answers with the most likely language among the ones the detector was
+// built with, so the list must also contain non-Norwegian languages. With only
+// Bokmal and Nynorsk nearly all Latin-script text is reported as one of the two
+// and the false positive check in eventProcessor practically never rejects a post.
+var languages = []lingua.Language{
+	lingua.Bokmal,
+	lingua.Nynorsk,
+	// Contrast languages; extend the list when other false positives show up.
+	lingua.English,
+	lingua.German,
+	lingua.French,
+	lingua.Spanish,
+	lingua.Portuguese,
+}
+
+var detector = lingua.NewLanguageDetectorBuilder().FromLanguages(languages...).Build()
+
 // Subscribe to the firehose using the Firehose struct as a receiver
 func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64) {
 	address := "wss://bsky.network/xrpc/com.atproto.sync.subscribeRepos"
 	headers := http.Header{}
 	headers.Set("User-Agent", "NorSky: https://github.com/snorremd/norsky")
@@ -122,6 +142,18 @@ func eventProcessor(postChan chan interface{}, context context.Context, ticker *
 
 					// Contains any of the languages in the post that are one of the following: nb, nn, se
 					if lo.Some(post.Langs, []string{"no", "nb", "nn", "se"}) {
+						// If tagged as no, nb, nn we need to detect the language to weed out false positives
+						if lo.Some(post.Langs, []string{"no", "nb", "nn"}) {
+							// Keep the post only when lingua reliably picks Bokmal or Nynorsk
+							// over the other candidate languages
+							lang, exists := detector.DetectLanguageOf(post.Text)
+							if !exists || (lang != lingua.Bokmal && lang != lingua.Nynorsk) {
+								// Debug, not Warn: dropping a mistagged post is expected and frequent
+								log.Debug("Not norwegian, skipping")
+								continue
+							}
+						}
+
 						// Keep track of what commits we have processed
 						postChan <- models.ProcessSeqEvent{
 							Seq: evt.Seq,
diff --git a/go.mod b/go.mod
index b5db14c..c3e2970 100644
--- a/go.mod
+++ b/go.mod
@@ -22,6 +22,8 @@ require (
 	golang.org/x/crypto/x509roots/fallback v0.0.0-20241107225453-6018723c7405
 )
 
+require github.com/shopspring/decimal v1.3.1 // indirect
+
 require (
 	github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b // indirect
 	github.com/andybalholm/brotli v1.1.1 // indirect
@@ -102,6 +104,7 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/ncruces/go-strftime v0.1.9 // indirect
 	github.com/opentracing/opentracing-go v1.2.0 // indirect
+	github.com/pemistahl/lingua-go v1.4.0
 	github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
 	github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
 	github.com/polydawn/refmt v0.89.1-0.20221221234430-40501e09de1f // indirect
diff --git a/go.sum b/go.sum
index a93fb10..1787f29 100644
--- a/go.sum
+++ b/go.sum
@@ -312,6 +312,8 @@ github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdh
 github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
 github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
 github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
+github.com/pemistahl/lingua-go v1.4.0 h1:ifYhthrlW7iO4icdubwlduYnmwU37V1sbNrwhKBR4rM=
+github.com/pemistahl/lingua-go v1.4.0/go.mod h1:ECuM1Hp/3hvyh7k8aWSqNCPlTxLemFZsRjocUf3KgME=
 github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 h1:1/WtZae0yGtPq+TI6+Tv1WTxkukpXeMlviSxvL7SRgk=
 github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9/go.mod h1:x3N5drFsm2uilKKuuYo6LdyD8vZAW55sH/9w+pbo1sw=
 github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c h1:dAMKvw0MlJT1GshSTtih8C2gDs04w8dReiOGXrGLNoY=
@@ -344,6 +346,8 @@ github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
 github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
 github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
 github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
+github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
+github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
 github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
 github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=