Skip to content

Commit

Permalink
feat: Filter out false-positives with lingua-go
Browse files Browse the repository at this point in the history
Sometimes people forget to switch their post language back to English
when posting in English. This causes the feed to contain a lot of
English-language posts. We can use the lingua-go library to help us
filter out these false positives. If the post is tagged as no, nb, or nn,
we use lingua-go to detect whether it is either Bokmål or Nynorsk. If it
is not, we don't add it to the feed database.

This should be reasonably performant as we're only testing the posts
tagged as Norwegian. We are not testing Sami posts, as lingua-go does
not have detection for that language today.
  • Loading branch information
snorremd committed Nov 21, 2024
1 parent 75fb2f8 commit d54abd7
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 0 deletions.
22 changes: 22 additions & 0 deletions firehose/firehose.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,23 @@ import (
"github.com/bluesky-social/indigo/repomgr"
"github.com/cenkalti/backoff/v4"
"github.com/gorilla/websocket"
lingua "github.com/pemistahl/lingua-go"
"github.com/samber/lo"
log "github.com/sirupsen/logrus"
)

// Language detection setup for lingua-go.
//
// languages is the fixed set of candidate languages handed to the detector
// builder, and detector is the package-level detector built from that set.
//
// NOTE(review): the detector is built from Bokmål and Nynorsk only, so text
// in another language (e.g. English) may still be reported as one of these
// two. Confirm whether more candidate languages are needed for the
// false-positive filter to be effective.
var (
	languages = []lingua.Language{
		lingua.Bokmal,
		lingua.Nynorsk,
	}

	detector = lingua.NewLanguageDetectorBuilder().
			FromLanguages(languages...).
			Build()
)

// Subscribe subscribes to the firehose.
func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64) {

address := "wss://bsky.network/xrpc/com.atproto.sync.subscribeRepos"
headers := http.Header{}
headers.Set("User-Agent", "NorSky: https://github.com/snorremd/norsky")
Expand Down Expand Up @@ -122,6 +133,17 @@ func eventProcessor(postChan chan interface{}, context context.Context, ticker *

// Contains any of the languages in the post that are one of the following: no, nb, nn, se
if lo.Some(post.Langs, []string{"no", "nb", "nn", "se"}) {

// If tagged as no, nb, nn we need to detect the language to weed out false positives
if lo.Some(post.Langs, []string{"no", "nb", "nn"}) {
// Detect language
_, exists := detector.DetectLanguageOf(post.Text)
if !exists {
log.Warn("Not norwegian, skipping")
continue
}
}

// Keep track of what commits we have processed
postChan <- models.ProcessSeqEvent{
Seq: evt.Seq,
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ require (
golang.org/x/crypto/x509roots/fallback v0.0.0-20241107225453-6018723c7405
)

require github.com/shopspring/decimal v1.3.1 // indirect

require (
github.com/RussellLuo/slidingwindow v0.0.0-20200528002341-535bb99d338b // indirect
github.com/andybalholm/brotli v1.1.1 // indirect
Expand Down Expand Up @@ -102,6 +104,7 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/ncruces/go-strftime v0.1.9 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/pemistahl/lingua-go v1.4.0
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
github.com/polydawn/refmt v0.89.1-0.20221221234430-40501e09de1f // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdh
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
github.com/pemistahl/lingua-go v1.4.0 h1:ifYhthrlW7iO4icdubwlduYnmwU37V1sbNrwhKBR4rM=
github.com/pemistahl/lingua-go v1.4.0/go.mod h1:ECuM1Hp/3hvyh7k8aWSqNCPlTxLemFZsRjocUf3KgME=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 h1:1/WtZae0yGtPq+TI6+Tv1WTxkukpXeMlviSxvL7SRgk=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9/go.mod h1:x3N5drFsm2uilKKuuYo6LdyD8vZAW55sH/9w+pbo1sw=
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c h1:dAMKvw0MlJT1GshSTtih8C2gDs04w8dReiOGXrGLNoY=
Expand Down Expand Up @@ -344,6 +346,8 @@ github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
Expand Down

0 comments on commit d54abd7

Please sign in to comment.