Skip to content

Commit

Permalink
fix: Calculate lingua confidence to better filter posts
Browse files Browse the repository at this point in the history
Use lingua-go's ComputeLanguageConfidence for Bokmål and Nynorsk to see
how confident lingua-go is. If both confidences are below the threshold
(0.6 by default), we can safely disregard the post. Hopefully this increases accuracy.
  • Loading branch information
snorremd committed Dec 30, 2024
1 parent cad43db commit 92ae77c
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 12 deletions.
17 changes: 14 additions & 3 deletions cmd/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ func serveCmd() *cli.Command {
EnvVars: []string{"NORSKY_DETECT_FALSE_NEGATIVES"},
Value: false,
},
&cli.Float64Flag{
Name: "confidence-threshold",
Usage: "Minimum confidence threshold for language detection",
EnvVars: []string{"NORSKY_CONFIDENCE_THRESHOLD"},
Value: 0.6,
},
},

Action: func(ctx *cli.Context) error {
Expand All @@ -77,12 +83,17 @@ func serveCmd() *cli.Command {
hostname := ctx.String("hostname")
host := ctx.String("host")
port := ctx.Int("port")

confidenceThreshold := ctx.Float64("confidence-threshold")
detectFalseNegatives := ctx.Bool("detect-false-negatives")
// Check if any of the required flags are missing
if hostname == "" {
return errors.New("missing required flag: --hostname")
}

if confidenceThreshold < 0 || confidenceThreshold > 1.0 {
return errors.New("confidence-threshold must be between 0 and 1")
}

err := db.Migrate(database)

if err != nil {
Expand Down Expand Up @@ -154,7 +165,7 @@ func serveCmd() *cli.Command {
}
}()
fmt.Println("Subscribing to firehose...")
firehose.Subscribe(firehoseCtx, postChan, livenessTicker, seq, ctx.Bool("detect-false-negatives"))
firehose.Subscribe(firehoseCtx, postChan, livenessTicker, seq, detectFalseNegatives, confidenceThreshold)
}()

go func() {
Expand Down Expand Up @@ -202,7 +213,7 @@ func serveCmd() *cli.Command {
firehoseCtx = context.WithValue(firehoseCtx, cancelKey, cancel)

// Restart subscription in new goroutine
go firehose.Subscribe(firehoseCtx, postChan, livenessTicker, seq, ctx.Bool("detect-false-negatives"))
go firehose.Subscribe(firehoseCtx, postChan, livenessTicker, seq, ctx.Bool("detect-false-negatives"), ctx.Float64("confidence-threshold"))
}
}
}
Expand Down
22 changes: 21 additions & 1 deletion cmd/subscribe.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ Returns each post as a JSON object on a single line. Use a tool like jq to proce
the output.
Prints all other log messages to stderr.`,
Flags: []cli.Flag{
&cli.BoolFlag{
Name: "detect-false-negatives",
Usage: "Detect false negatives in language detection",
EnvVars: []string{"NORSKY_DETECT_FALSE_NEGATIVES"},
},
&cli.Float64Flag{
Name: "confidence-threshold",
Usage: "Confidence threshold for language detection (0-1)",
EnvVars: []string{"NORSKY_CONFIDENCE_THRESHOLD"},
Value: 0.6,
},
},
Action: func(ctx *cli.Context) error {
// Get the context for this process to pass to firehose

Expand All @@ -43,7 +56,14 @@ Prints all other log messages to stderr.`,

go func() {
fmt.Println("Subscribing to firehose...")
firehose.Subscribe(ctx.Context, postChan, ticker, -1, ctx.Bool("detect-false-negatives"))
firehose.Subscribe(
ctx.Context,
postChan,
ticker,
-1,
ctx.Bool("detect-false-negatives"),
ctx.Float64("confidence-threshold"),
)
}()

go func() {
Expand Down
43 changes: 35 additions & 8 deletions firehose/firehose.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,12 @@ func containsSpamContent(text string) bool {
}

// Subscribe subscribes to the firehose and feeds its events to the post processor
func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64, detectFalseNegatives bool) {
func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Ticker, seq int64, detectFalseNegatives bool, confidenceThreshold float64) {
// Validate confidence threshold
if confidenceThreshold < 0 || confidenceThreshold > 1 {
log.Warnf("Invalid confidence threshold %f, using default 0.6", confidenceThreshold)
confidenceThreshold = 0.6
}

InitDetector()

Expand Down Expand Up @@ -267,7 +272,7 @@ func Subscribe(ctx context.Context, postChan chan interface{}, ticker *time.Tick
ThroughputBucketCount: 10,
},
conn.RemoteAddr().String(),
eventProcessor(postChan, ctx, ticker, detectFalseNegatives).EventHandler)
eventProcessor(postChan, ctx, ticker, detectFalseNegatives, confidenceThreshold).EventHandler)
err = events.HandleRepoStream(ctx, conn, scheduler)

// If error sleep
Expand Down Expand Up @@ -319,6 +324,7 @@ type PostProcessor struct {
context context.Context
ticker *time.Ticker
detectFalseNegatives bool
confidenceThreshold float64
}

// Move language detection logic to its own function
Expand All @@ -327,8 +333,29 @@ func (p *PostProcessor) DetectNorwegianLanguage(text string, currentLangs []stri
return false, currentLangs
}

lang, exists := detector.DetectLanguageOf(text)
if !exists || lang == lingua.English || (lang != lingua.Bokmal && lang != lingua.Nynorsk) {
// Skip language detection when the number of '#' characters exceeds 30% of the
// whitespace-separated word count (a rough proxy for hashtag-heavy posts)
words := strings.Fields(text)
if len(words) > 0 {
hashtagCount := strings.Count(text, "#")
hashtagRatio := float64(hashtagCount) / float64(len(words))
if hashtagRatio > 0.3 {
return false, currentLangs
}
}

detectedLang, exists := detector.DetectLanguageOf(text)
if !exists || detectedLang == lingua.English || (detectedLang != lingua.Bokmal && detectedLang != lingua.Nynorsk) {
return false, currentLangs
}

// Get confidence scores (each between 0 and 1) for the Norwegian languages
bokmalConf := detector.ComputeLanguageConfidence(text, lingua.Bokmal)
nynorskConf := detector.ComputeLanguageConfidence(text, lingua.Nynorsk)

log.Infof("Bokmal confidence: %.2f, Nynorsk confidence: %.2f (threshold: %.2f)",
bokmalConf, nynorskConf, p.confidenceThreshold)

if bokmalConf < p.confidenceThreshold && nynorskConf < p.confidenceThreshold {
return false, currentLangs
}

Expand All @@ -337,13 +364,12 @@ func (p *PostProcessor) DetectNorwegianLanguage(text string, currentLangs []stri
copy(updatedLangs, currentLangs)

// Add detected language if not present
if lang == lingua.Bokmal && !lo.Contains(updatedLangs, "nb") {
if detectedLang == lingua.Bokmal && !lo.Contains(updatedLangs, "nb") {
updatedLangs = append(updatedLangs, "nb")
} else if lang == lingua.Nynorsk && !lo.Contains(updatedLangs, "nn") {
} else if detectedLang == lingua.Nynorsk && !lo.Contains(updatedLangs, "nn") {
updatedLangs = append(updatedLangs, "nn")
}

log.Infof("Detected language: %s for post tagged as %s: %s", lang.String(), currentLangs, text)
return true, updatedLangs
}

Expand Down Expand Up @@ -403,12 +429,13 @@ func (p *PostProcessor) processPost(evt *atproto.SyncSubscribeRepos_Commit, op *
}

// Main event processor function is now more focused
func eventProcessor(postChan chan interface{}, context context.Context, ticker *time.Ticker, detectFalseNegatives bool) *events.RepoStreamCallbacks {
func eventProcessor(postChan chan interface{}, context context.Context, ticker *time.Ticker, detectFalseNegatives bool, confidenceThreshold float64) *events.RepoStreamCallbacks {
processor := &PostProcessor{
postChan: postChan,
context: context,
ticker: ticker,
detectFalseNegatives: detectFalseNegatives,
confidenceThreshold: confidenceThreshold,
}

return &events.RepoStreamCallbacks{
Expand Down

0 comments on commit 92ae77c

Please sign in to comment.