Skip to content

Commit

Permalink
fix: Filter out English if over confidence threshold
Browse files Browse the repository at this point in the history
English has a tendency of sneaking into the language feeds. If English
is not one of the target languages, check whether the English confidence
is high enough that we ought to filter it out.
  • Loading branch information
snorremd committed Jan 18, 2025
1 parent 395fbcb commit b47f238
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions firehose/firehose.go
Original file line number Diff line number Diff line change
Expand Up @@ -440,20 +440,28 @@ type PostProcessor struct {

// Rename to DetectLanguage since it's no longer Norwegian-specific
func (p *PostProcessor) DetectLanguage(text string, currentLangs []string, targetLangs []lingua.Language) (bool, []string) {
// First check English confidence separately
englishConf := p.languageDetector.ComputeLanguageConfidence(text, lingua.English)

// If text is primarily English (high confidence), skip it unless English is a target language
if englishConf > 0.8 && !lo.Contains(targetLangs, lingua.English) {
return false, currentLangs
}

var highestConf float64
var detectedLang lingua.Language

// Check confidence for English and all target languages
for _, lang := range append([]lingua.Language{lingua.English}, targetLangs...) {
// Only check target languages
for _, lang := range targetLangs {
conf := p.languageDetector.ComputeLanguageConfidence(text, lang)
if conf > highestConf {
highestConf = conf
detectedLang = lang
}
}

// If confidence is too low or detected language is English, skip
if highestConf < p.config.ConfidenceThreshold || detectedLang == lingua.English {
// If confidence is too low, skip
if highestConf < p.config.ConfidenceThreshold {
return false, currentLangs
}

Expand Down

0 comments on commit b47f238

Please sign in to comment.