Skip to content

Commit

Permalink
Merge pull request #20 from heussd/feature/keywords-file-in-the-cloud
Browse files Browse the repository at this point in the history
Implement cloud-based keywords file which will be retrieved during op…
  • Loading branch information
heussd authored Jan 9, 2025
2 parents 898e0b1 + 12ad04f commit 2eb2d2a
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 70 deletions.
5 changes: 3 additions & 2 deletions keyword-matcher-go/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ package config

import (
"fmt"
"github.com/nats-io/nats.go"
"os"

"github.com/nats-io/nats.go"
)

var (
KeywordsFile = GetEnv("KEYWORDS_FILE", "keywords.txt")
KeywordsFileUrl = GetEnv("KEYWORDS_FILE_URL", "https://raw.githubusercontent.com/heussd/nats-news-analysis/refs/heads/main/keyword-matcher-go/internal/keywords/keywords.txt")
FullTextRssServer = GetEnv("FULLTEXTRSS_SERVER", "http://localhost:80")
NatsServer = GetEnv("NATS_SERVER", nats.DefaultURL)
NatsInputQueueName = GetEnv("NATS_INPUT_QUEUE_NAME", "article-urls")
Expand Down
72 changes: 10 additions & 62 deletions keyword-matcher-go/internal/keywords/keywords.go
Original file line number Diff line number Diff line change
@@ -1,77 +1,25 @@
package keywords

import (
"bufio"
"fmt"
"github.com/dlclark/regexp2"
"github.com/heussd/nats-news-keyword-matcher.go/internal/config"
"os"
"strings"
)

// KeywordEntry pairs a compiled keyword pattern with the strings derived from
// the keywords-file line it was parsed from.
type KeywordEntry struct {
// regexp is the compiled form of text.
regexp regexp2.Regexp
// id is humanReadable(text); Match returns it when regexp matches.
id string
// text is the raw line as read from the keywords file.
text string
}

// cleanUpRegexes are applied in order by humanReadable, which replaces every
// match with a single space.
var cleanUpRegexes = []regexp2.Regexp{
// Any single character that is not an ASCII letter.
*regexp2.MustCompile("[^a-zA-Z]", 0),
// A single non-whitespace character with whitespace on both sides.
*regexp2.MustCompile("\\s\\S\\s", 0),
// A run of two or more whitespace characters.
*regexp2.MustCompile("\\s\\s+", 0),
}

var keywords []KeywordEntry

func init() {
readFile, err := os.Open(config.KeywordsFile)
if err != nil {
panic(err)
}

fileScanner := bufio.NewScanner(readFile)
fileScanner.Split(bufio.ScanLines)

for fileScanner.Scan() {
var text = fileScanner.Text()

if text == "" ||
strings.HasPrefix(text, "#") {
continue
}

fmt.Printf("Parsing \"%s\" as regex\n", text)
"github.com/heussd/nats-news-keyword-matcher.go/internal/model"
"github.com/heussd/nats-news-keyword-matcher.go/pkg/cloudtextfile"
)

var regex = regexp2.MustCompile(text, 0)
keywords = append(keywords, KeywordEntry{
regexp: *regex,
id: humanReadable(text),
text: text,
})
}
func Match(s string) (bool, string) {
var keywords []model.KeywordEntry
var err error

if len(keywords) == 0 {
fmt.Println("Error: No keywords found")
if keywords, err = cloudtextfile.CachedParsedKeywords(); err != nil {
fmt.Println(err)
os.Exit(1)
}
}

// humanReadable derives a display label from a keyword pattern.
//
// Each expression in cleanUpRegexes is applied in order, with every match
// replaced by a single space; the result is then stripped of leading and
// trailing whitespace. It panics if a replacement reports an error.
func humanReadable(regex string) string {
	label := regex
	for i := range cleanUpRegexes {
		replaced, err := cleanUpRegexes[i].Replace(label, " ", 0, -1)
		if err != nil {
			panic(err)
		}
		label = replaced
	}
	return strings.TrimSpace(label)
}

func Match(s string) (bool, string) {
for _, v := range keywords {
if match, _ := v.regexp.MatchString(s); match {
return true, v.id
if match, _ := v.Regexp.MatchString(s); match {
return true, v.Id
}
}
return false, ""
Expand Down
8 changes: 2 additions & 6 deletions keyword-matcher-go/internal/keywords/keywords_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package keywords

import (
"github.com/stretchr/testify/assert"
"testing"

"github.com/stretchr/testify/assert"
)

func first(flag bool, _ string) bool {
Expand Down Expand Up @@ -47,11 +48,6 @@ func TestLocalIT(t *testing.T) {

}

func TestHumanReadable(t *testing.T) {
assert.Equal(t, "delicious pie recipes", humanReadable("(?i)(delicious).*(pie|recipes)"))

}

func TestStringMatchReturn(t *testing.T) {
_, text := Match("A little Peach a day")
assert.Equal(t, "Apple peach", text)
Expand Down
9 changes: 9 additions & 0 deletions keyword-matcher-go/internal/model/KeywordEntry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package model

import "github.com/dlclark/regexp2"

// KeywordEntry pairs a compiled keyword pattern with the strings derived from
// the keywords-file line it was parsed from.
type KeywordEntry struct {
// Regexp is the compiled form of Text.
Regexp regexp2.Regexp
// Id is the human-readable label derived from Text; keywords.Match returns
// it when Regexp matches.
Id string
// Text is the raw pattern line as retrieved from the keywords file.
Text string
}
96 changes: 96 additions & 0 deletions keyword-matcher-go/pkg/cloudtextfile/cloudtextfile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package cloudtextfile

import (
"fmt"
"io"
"net/http"
"strings"
"time"

"github.com/dlclark/regexp2"
"github.com/heussd/nats-news-keyword-matcher.go/internal/config"
"github.com/heussd/nats-news-keyword-matcher.go/internal/model"
)

// Package-level cache state used by CachedParsedKeywords.
var (
// cachedKeywords holds the entries produced by the most recent rebuild.
cachedKeywords []model.KeywordEntry
// lastGenerated is when cachedKeywords was last rebuilt. Its zero value is
// far enough in the past that the first call always rebuilds.
lastGenerated time.Time
// cacheDuration is how old the cache may get before it is rebuilt.
cacheDuration = 20 * time.Minute
)

// RetrieveKeywordsFile downloads the text file at config.KeywordsFileUrl and
// returns its content split on "\n".
//
// Lines are returned verbatim: blank lines and "#" comment lines are not
// filtered out here. A non-nil error is returned if the request cannot be
// built or sent, if the server answers with anything other than 200 OK, or if
// the body cannot be read.
func RetrieveKeywordsFile() (keywords []string, err error) {
	// Bound the whole exchange so that a stalled server cannot hang the caller.
	client := &http.Client{Timeout: 30 * time.Second}

	req, err := http.NewRequest(http.MethodGet, config.KeywordsFileUrl, nil)
	if err != nil {
		return nil, err
	}

	response, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// Close the body on every return path (including the non-200 one) so the
	// underlying connection is released instead of leaked.
	defer response.Body.Close()

	if status := response.StatusCode; status != http.StatusOK {
		return nil, fmt.Errorf("failed to retrieve keywords file: status code %d", status)
	}

	body, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, err
	}

	return strings.Split(string(body), "\n"), nil
}

// humanReadable derives a display label from a keyword pattern.
//
// Each expression in cleanUpRegexes is applied in order, with every match
// replaced by a single space; the result is then stripped of leading and
// trailing whitespace. It panics if a replacement reports an error.
func humanReadable(regex string) string {
	label := regex
	for i := range cleanUpRegexes {
		replaced, err := cleanUpRegexes[i].Replace(label, " ", 0, -1)
		if err != nil {
			panic(err)
		}
		label = replaced
	}
	return strings.TrimSpace(label)
}

// cleanUpRegexes are applied in order by humanReadable, which replaces every
// match with a single space.
var cleanUpRegexes = []regexp2.Regexp{
// Any single character that is not an ASCII letter.
*regexp2.MustCompile("[^a-zA-Z]", 0),
// A single non-whitespace character with whitespace on both sides.
*regexp2.MustCompile("\\s\\S\\s", 0),
// A run of two or more whitespace characters.
*regexp2.MustCompile("\\s\\s+", 0),
}

// CachedParsedKeywords returns the keyword entries parsed from the remote
// keywords file, rebuilding them when the cache is older than cacheDuration.
//
// On a rebuild, every line from RetrieveKeywordsFile that is neither empty nor
// starts with "#" is compiled as a regexp2 pattern. If the download fails or a
// line does not compile, the error is returned and the previously cached
// entries and timestamp are left untouched.
func CachedParsedKeywords() (keywords []model.KeywordEntry, err error) {
	if time.Since(lastGenerated) <= cacheDuration {
		return cachedKeywords, nil
	}

	plainKeywords, err := RetrieveKeywordsFile()
	if err != nil {
		return nil, err
	}

	keywords = []model.KeywordEntry{}
	for _, text := range plainKeywords {
		// The file is split on "\n" only; drop a trailing "\r" so a CRLF file
		// does not smuggle a carriage return into every pattern.
		text = strings.TrimSuffix(text, "\r")
		if text == "" || strings.HasPrefix(text, "#") {
			continue
		}

		fmt.Printf("Parsing \"%s\" as regex\n", text)

		// The patterns come from a downloaded file, so a malformed one is a
		// runtime condition to report, not a programmer error to panic on.
		regex, compileErr := regexp2.Compile(text, 0)
		if compileErr != nil {
			return nil, fmt.Errorf("compiling keyword %q: %w", text, compileErr)
		}

		keywords = append(keywords, model.KeywordEntry{
			Regexp: *regex,
			Id:     humanReadable(text),
			Text:   text,
		})
	}

	// Publish only after the whole file parsed successfully.
	cachedKeywords = keywords
	lastGenerated = time.Now()

	return cachedKeywords, nil
}
56 changes: 56 additions & 0 deletions keyword-matcher-go/pkg/cloudtextfile/cloudtextfile_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package cloudtextfile

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
)

// Test downloads the keywords file from the configured URL and checks its
// line count and first line.
func Test(t *testing.T) {
	keywords, err := RetrieveKeywordsFile()
	if err != nil {
		// Stop here: on failure keywords is nil, and indexing it below would
		// panic rather than report a clean test failure.
		t.Fatal(err)
	}

	assert.Equal(t,
		12,
		len(keywords))

	assert.Equal(t,
		"(?i)\\b(Apple|peach)",
		keywords[0])
}

// TestCache verifies that a second call to CachedParsedKeywords made shortly
// after the first is served from the cache, i.e. lastGenerated is unchanged.
func TestCache(t *testing.T) {
	// The first call has to build the cache.
	initial, err := CachedParsedKeywords()
	if err != nil {
		t.Error(err)
	}
	assert.NotNil(t, initial)
	assert.Greater(t, len(initial), 0)

	// Remember when the cache was built so a rebuild can be detected.
	generatedAt := lastGenerated

	// A one-second pause is far below cacheDuration, so no rebuild is expected.
	time.Sleep(1 * time.Second)
	repeated, err := CachedParsedKeywords()
	if err != nil {
		t.Error(err)
	}

	// Same timestamp means the cached entries were reused.
	assert.Equal(t, generatedAt, lastGenerated)
	assert.NotNil(t, repeated)
	assert.Greater(t, len(repeated), 0)
}

func TestHumanReadable(t *testing.T) {
assert.Equal(t, "delicious pie recipes", humanReadable("(?i)(delicious).*(pie|recipes)"))

}

0 comments on commit 2eb2d2a

Please sign in to comment.