[feat] - cache duplicate creds during detection #2276

Draft
wants to merge 32 commits into base: main

Changes from 30 commits
Commits (32)
f2f62ab
Extend memory cache to allow for configuring custom expiration and pu…
ahrav Jan 7, 2024
9c016d7
use any for value type
ahrav Jan 7, 2024
e6685c7
fix test
ahrav Jan 7, 2024
6a4eabd
fix test
ahrav Jan 7, 2024
5dc03be
cache results to prevent multiple network calls using the same creds
ahrav Jan 7, 2024
82348f9
use custom value
ahrav Jan 7, 2024
39b1642
rename field
ahrav Jan 7, 2024
4af9af3
use ptr for the cacheItem
ahrav Jan 7, 2024
e4b465d
use constructor
ahrav Jan 8, 2024
dd9b6ea
remove nil check
ahrav Jan 8, 2024
f45bb31
add method to popualte result from the cache item
ahrav Jan 8, 2024
f286c33
update comment
ahrav Jan 8, 2024
c4162e2
missed a ptr
ahrav Jan 8, 2024
318eebb
add test
ahrav Jan 8, 2024
c1a330d
rename
ahrav Jan 8, 2024
8ceef69
address comments
ahrav Jan 8, 2024
5609ca3
address
ahrav Jan 8, 2024
47916e3
make new construct more clear
ahrav Jan 8, 2024
32b75c2
cache results to prevent multiple network calls using the same creds
ahrav Jan 7, 2024
5605ad0
use custom value
ahrav Jan 7, 2024
6c6d58e
rename field
ahrav Jan 7, 2024
72c9722
use ptr for the cacheItem
ahrav Jan 7, 2024
eee751f
use constructor
ahrav Jan 8, 2024
a08343f
remove nil check
ahrav Jan 8, 2024
0a4155b
add method to popualte result from the cache item
ahrav Jan 8, 2024
dac2eb1
update comment
ahrav Jan 8, 2024
a8703a2
missed a ptr
ahrav Jan 8, 2024
06f99da
add test
ahrav Jan 8, 2024
a586b5f
rename
ahrav Jan 8, 2024
6199dee
Merge branch 'feat-cache-dupe-creds' of github.com:trufflesecurity/tr…
ahrav Jan 8, 2024
fae7617
merge main
ahrav Jan 11, 2024
5faf556
merge main
ahrav Jan 25, 2024
6 changes: 3 additions & 3 deletions pkg/cache/cache.go
@@ -4,9 +4,9 @@ package cache
// Cache is used to store key/value pairs.
type Cache interface {
// Set stores the given key/value pair.
Set(string, string)
Set(string, any)
// Get returns the value for the given key and a boolean indicating if the key was found.
Get(string) (string, bool)
Get(string) (any, bool)
// Exists returns true if the given key exists in the cache.
Exists(string) bool
// Delete the given key from the cache.
@@ -18,7 +18,7 @@ type Cache interface {
// Keys returns all keys in the cache.
Keys() []string
// Values returns all values in the cache.
Values() []string
Values() []any
// Contents returns all keys in the cache encoded as a string.
Contents() string
}
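
Because Get and Values now return any instead of string, existing callers need a type assertion when they expect string values. A minimal illustration with hypothetical caller code (not part of this diff):

package example

import "github.com/trufflesecurity/trufflehog/v3/pkg/cache"

// getString is a hypothetical helper showing how a caller recovers a typed
// string value now that Cache.Get returns any.
func getString(c cache.Cache, key string) (string, bool) {
	val, ok := c.Get(key)
	if !ok {
		return "", false
	}
	s, ok := val.(string)
	return s, ok
}
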
82 changes: 59 additions & 23 deletions pkg/cache/memory/memory.go
@@ -10,47 +10,83 @@ import (
)

const (
expirationInterval = 12 * time.Hour
purgeInterval = 13 * time.Hour
defaultExpiration = cache.DefaultExpiration
defaultExpirationInterval = 12 * time.Hour
defaultPurgeInterval = 13 * time.Hour
defaultExpiration = cache.DefaultExpiration
)

// Cache is a wrapper around the go-cache library.
// Cache wraps the go-cache library to provide an in-memory key-value store.
type Cache struct {
c *cache.Cache
c *cache.Cache
expiration time.Duration
purgeInterval time.Duration
}

// New constructs a new in-memory cache.
func New() *Cache {
c := cache.New(expirationInterval, purgeInterval)
return &Cache{c: c}
// CacheOption defines a function type used for configuring a Cache.
type CacheOption func(*Cache)

// WithExpirationInterval returns a CacheOption to set the expiration interval of cache items.
// The interval determines the duration a cached item remains in the cache before it is expired.
func WithExpirationInterval(interval time.Duration) CacheOption {
return func(c *Cache) { c.expiration = interval }
}

// WithPurgeInterval returns a CacheOption to set the interval at which the cache purges expired items.
// Regular purging helps in freeing up memory by removing stale entries.
func WithPurgeInterval(interval time.Duration) CacheOption {
return func(c *Cache) { c.purgeInterval = interval }
}

// New constructs a new in-memory cache instance with optional configurations.
// By default, it sets the expiration and purge intervals to 12 and 13 hours, respectively.
// These defaults can be overridden using the functional options: WithExpirationInterval and WithPurgeInterval.
func New(opts ...CacheOption) *Cache {
configurableCache := &Cache{expiration: defaultExpirationInterval, purgeInterval: defaultPurgeInterval}
for _, opt := range opts {
opt(configurableCache)
}

// The underlying cache is initialized with the configured expiration and purge intervals.
configurableCache.c = cache.New(configurableCache.expiration, configurableCache.purgeInterval)
return configurableCache
}

// CacheEntry represents a single entry in the cache, consisting of a key and its corresponding value.
type CacheEntry struct {
// Key is the unique identifier for the entry.
Key string
// Value is the data stored in the entry.
Value any
}

// NewWithData constructs a new in-memory cache with existing data.
func NewWithData(ctx context.Context, data []string) *Cache {
// It also accepts CacheOption parameters to override default configuration values.
func NewWithData(ctx context.Context, data []CacheEntry, opts ...CacheOption) *Cache {
ctx.Logger().V(3).Info("Loading cache", "num-items", len(data))

instance := &Cache{expiration: defaultExpirationInterval, purgeInterval: defaultPurgeInterval}
for _, opt := range opts {
opt(instance)
}

// Convert data slice to map required by go-cache.
items := make(map[string]cache.Item, len(data))
for _, d := range data {
items[d] = cache.Item{Object: d, Expiration: int64(defaultExpiration)}
items[d.Key] = cache.Item{Object: d.Value, Expiration: int64(defaultExpiration)}
}

c := cache.NewFrom(expirationInterval, purgeInterval, items)
return &Cache{c: c}
instance.c = cache.NewFrom(instance.expiration, instance.purgeInterval, items)
return instance
}

// Set adds a key-value pair to the cache.
func (c *Cache) Set(key, value string) {
func (c *Cache) Set(key string, value any) {
c.c.Set(key, value, defaultExpiration)
}

// Get returns the value for the given key.
func (c *Cache) Get(key string) (string, bool) {
res, ok := c.c.Get(key)
if !ok {
return "", ok
}
return res.(string), ok
func (c *Cache) Get(key string) (any, bool) {
return c.c.Get(key)
}

// Exists returns true if the given key exists in the cache.
@@ -85,11 +121,11 @@ func (c *Cache) Keys() []string {
}

// Values returns all values in the cache.
func (c *Cache) Values() []string {
func (c *Cache) Values() []any {
items := c.c.Items()
res := make([]string, 0, len(items))
res := make([]any, 0, len(items))
for _, v := range items {
res = append(res, v.Object.(string))
res = append(res, v.Object)
}
return res
}
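For reference, a rough usage sketch of the new functional options and the seeded constructor (illustrative only; the intervals and values are arbitrary, and the pkg/context import path is assumed from the tests):

package example

import (
	"time"

	"github.com/trufflesecurity/trufflehog/v3/pkg/cache/memory"
	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

func buildCaches() {
	// Default cache: 12h expiration interval, 13h purge interval.
	c := memory.New()
	c.Set("key", "value")

	// Cache with custom intervals via functional options.
	short := memory.New(
		memory.WithExpirationInterval(1*time.Hour),
		memory.WithPurgeInterval(2*time.Hour),
	)
	short.Set("token", 42) // values may now be of any type

	// Cache pre-seeded with existing entries.
	seeded := memory.NewWithData(logContext.Background(), []memory.CacheEntry{
		{Key: "key1", Value: "value1"},
	})
	_, _ = seeded.Get("key1")
}
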
12 changes: 8 additions & 4 deletions pkg/cache/memory/memory_test.go
@@ -34,15 +34,15 @@ func TestCache(t *testing.T) {
// Test delete.
c.Delete("key1")
v, ok = c.Get("key1")
if ok || v != "" {
if ok || v != nil {
t.Fatalf("Unexpected value for key1 after delete: %v, %v", v, ok)
}

// Test clear.
c.Set("key10", "key10")
c.Clear()
v, ok = c.Get("key10")
if ok || v != "" {
if ok || v != nil {
t.Fatalf("Unexpected value for key10 after clear: %v, %v", v, ok)
}

@@ -60,7 +60,10 @@ func TestCache(t *testing.T) {
}

// Test getting only the values.
vals := c.Values()
vals := make([]string, 0, c.Count())
for _, v := range c.Values() {
vals = append(vals, v.(string))
}
sort.Strings(vals)
sort.Strings(values)
if !cmp.Equal(values, vals) {
@@ -82,7 +85,8 @@ func TestCache(t *testing.T) {
}

func TestCache_NewWithData(t *testing.T) {
c := NewWithData(logContext.Background(), []string{"key1", "key2", "key3"})
data := []CacheEntry{{"key1", "value1"}, {"key2", "value2"}, {"key3", "value3"}}
c := NewWithData(logContext.Background(), data)

// Test the count.
if c.Count() != 3 {
45 changes: 44 additions & 1 deletion pkg/detectors/aws/aws.go
@@ -12,6 +12,8 @@ import (
"strings"
"time"

"github.com/trufflesecurity/trufflehog/v3/pkg/cache"
"github.com/trufflesecurity/trufflehog/v3/pkg/cache/memory"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
@@ -20,6 +22,8 @@ import (
type scanner struct {
verificationClient *http.Client
skipIDs map[string]struct{}

credsCache cache.Cache
}

// resourceTypes derived from: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_identifiers.html#identifiers-unique-ids
@@ -47,6 +51,11 @@ func New(opts ...func(*scanner)) *scanner {
opt(scanner)
}

scanner.credsCache = memory.New(
memory.WithExpirationInterval(1*time.Hour),

ahrav (Collaborator, Author) commented on Jan 8, 2024:
These are just placeholders for now; we could consider passing these in as args to New, or use a default value that seems reasonable.

Contributor commented:
How big could the cache conceivably get in a given time frame?

ahrav (Collaborator, Author) replied:
That's a good question. It would really depend on what was being scanned. We can run some tests to scan some repos and record some metrics to see how large this cache grows. Alternatively, we could use an LRU cache with a fixed size if needed.
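
Purely to illustrate the LRU alternative mentioned above (not part of this PR), a size-bounded cache could look roughly like this; the sketch assumes hashicorp/golang-lru/v2 as the underlying store:

package memory

import lru "github.com/hashicorp/golang-lru/v2"

// boundedCache is a hypothetical size-bounded alternative: instead of expiring
// entries on a timer, it keeps at most maxEntries items and evicts the least
// recently used entry when full.
type boundedCache struct {
	c *lru.Cache[string, any]
}

func newBoundedCache(maxEntries int) (*boundedCache, error) {
	c, err := lru.New[string, any](maxEntries)
	if err != nil {
		return nil, err
	}
	return &boundedCache{c: c}, nil
}

// Set adds or refreshes a key/value pair, evicting the LRU entry if the cache is full.
func (b *boundedCache) Set(key string, value any) { b.c.Add(key, value) }

// Get returns the value for key and marks it as recently used.
func (b *boundedCache) Get(key string) (any, bool) { return b.c.Get(key) }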

memory.WithPurgeInterval(2*time.Hour),
)

return scanner
}

@@ -101,6 +110,26 @@ func GetHMAC(key []byte, data []byte) []byte {
return hasher.Sum(nil)
}

// cacheItem represents an item stored in the cache, encompassing the outcome of a verification process.
// It includes the verification result, ExtraData, and VerificationErrors encountered during verification.
// This facilitates the reconstruction of detectors.Result with values for previously verified creds.
type cacheItem struct {
isVerified bool
verificationErr error
extra map[string]string
}

func newCacheItem(isVerified bool, verificationErr error, extra map[string]string) *cacheItem {
return &cacheItem{isVerified, verificationErr, extra}
}

// populateResult populates the given detectors.Result with the values from the cacheItem.
func (c *cacheItem) populateResult(result *detectors.Result) {
result.Verified = c.isVerified
result.ExtraData = c.extra
result.SetVerificationError(c.verificationErr)
}

// FromData will find and optionally verify AWS secrets in a given set of bytes.
func (s scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
dataStr := string(data)
@@ -126,16 +155,28 @@ func (s scanner) FromData(ctx context.Context, verify bool, data []byte) (result
}
resSecretMatch := strings.TrimSpace(secretMatch[1])

rawV2 := resIDMatch + resSecretMatch

s1 := detectors.Result{
DetectorType: detectorspb.DetectorType_AWS,
Raw: []byte(resIDMatch),
Redacted: resIDMatch,
RawV2: []byte(resIDMatch + resSecretMatch),
RawV2: []byte(rawV2),
ExtraData: map[string]string{
"resource_type": resourceTypes[idMatch[2]],
},
}

if val, ok := s.credsCache.Get(rawV2); ok {
item, ok := val.(*cacheItem)
if !ok {
continue

ahrav (Collaborator, Author) commented:
Not sure if this is the right behavior; I'd like to maybe log this as an error. Not sure if constructing our custom logcontext.AddLogger makes sense so we can do that?
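
For illustration, one possible shape for that logging (a hypothetical sketch, not part of this PR; it assumes logContext.AddLogger from pkg/context attaches a logger to a standard context, and mirrors the logging pattern used in pkg/sources/github further down):

// Hypothetical sketch. Assumes these imports resolve as in aws.go, and that
// logContext.AddLogger wraps a standard context with a usable Logger().
package aws

import (
	"context"
	"fmt"

	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

// resultFromCache converts a cached value back into a *cacheItem, logging values
// of an unexpected type instead of silently dropping them.
func resultFromCache(ctx context.Context, key string, val any) (*cacheItem, bool) {
	item, ok := val.(*cacheItem)
	if !ok {
		logCtx := logContext.AddLogger(ctx)
		logCtx.Logger().Error(
			fmt.Errorf("unexpected type %T in creds cache", val),
			"failed to convert cached value to *cacheItem", "key", key,
		)
		return nil, false
	}
	return item, true
}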

}
item.populateResult(&s1)
results = append(results, s1)
continue
}

if verify {
isVerified, extraData, verificationErr := s.verifyMatch(ctx, resIDMatch, resSecretMatch, true)
s1.Verified = isVerified
Expand All @@ -153,6 +194,8 @@ func (s scanner) FromData(ctx context.Context, verify bool, data []byte) (result
}
}

// Cache the result.
s.credsCache.Set(rawV2, newCacheItem(s1.Verified, s1.VerificationError(), s1.ExtraData))
if !s1.Verified {
// Unverified results that contain common test words are probably not secrets
if detectors.IsKnownFalsePositive(resSecretMatch, detectors.DefaultFalsePositives, true) {
55 changes: 55 additions & 0 deletions pkg/detectors/aws/aws_test.go
@@ -7,11 +7,14 @@ import (
"context"
"crypto/sha256"
"fmt"
"net/http"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"

"github.com/trufflesecurity/trufflehog/v3/pkg/cache/memory"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
@@ -326,6 +329,58 @@ func TestAWS_FromChunk(t *testing.T) {
}
}

// TestAWSFromDataCacheDuplicateCreds tests that duplicate credentials are not verified against the AWS API
// multiple times.
func TestAWSFromDataCacheDuplicateCreds(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()

testSecrets, err := common.GetSecret(ctx, "trufflehog-testing", "detectors4")
if err != nil {
t.Fatalf("could not get test secrets from GCP: %s", err)
}
secret := testSecrets.MustGetField("AWS")
id := testSecrets.MustGetField("AWS_ID")

// Mock HTTP client to intercept AWS API requests and count them.
apiCallCount := 0
mockClient := &http.Client{
Transport: roundTripperFunc(func(req *http.Request) (*http.Response, error) {
apiCallCount++
return &http.Response{StatusCode: http.StatusOK}, nil
}),
}

detector := scanner{verificationClient: mockClient, credsCache: memory.New()}

testData := []byte(fmt.Sprintf("You can find a aws secret %s within aws %s", secret, id))

// First call - expect cache to be empty and an API call to be made.
_, err = detector.FromData(context.Background(), true, testData)
if err != nil {
t.Fatalf("Error processing data: %s", err)
}
if apiCallCount != 1 {
t.Fatalf("Expected 1 API call, got %d", apiCallCount)
}

// Second call with the same data - expect cache to be used and no additional API calls.
_, err = detector.FromData(context.Background(), true, testData)
if err != nil {
t.Fatalf("Error processing data: %s", err)
}
if apiCallCount != 1 {
t.Fatalf("Cache did not work as expected, API call count: %d", apiCallCount)
}
}

// roundTripperFunc type is an adapter to allow the use of ordinary functions as HTTP round trippers.
type roundTripperFunc func(*http.Request) (*http.Response, error)

func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
return f(r)
}

func BenchmarkFromData(benchmark *testing.B) {
ctx := context.Background()
s := scanner{}
10 changes: 8 additions & 2 deletions pkg/sources/gcs/gcs.go
@@ -97,7 +97,7 @@ func newPersistableCache(increment int, cache cache.Cache, p *sources.Progress)

// Set overrides the cache Set method of the cache to enable the persistence
// of the cache contents the Progress of the source at given increments.
func (c *persistableCache) Set(key, val string) {
func (c *persistableCache) Set(key string, val any) {
c.Cache.Set(key, val)
if ok, contents := c.shouldPersist(); ok {
c.Progress.EncodedResumeInfo = contents
@@ -297,7 +297,13 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ .
func (s *Source) setupCache(ctx context.Context) *persistableCache {
var c cache.Cache
if s.Progress.EncodedResumeInfo != "" {
c = memory.NewWithData(ctx, strings.Split(s.Progress.EncodedResumeInfo, ","))
keys := strings.Split(s.Progress.EncodedResumeInfo, ",")
entries := make([]memory.CacheEntry, len(keys))
for i, val := range keys {
entries[i] = memory.CacheEntry{Key: val, Value: val}
}

c = memory.NewWithData(ctx, entries)
} else {
c = memory.New()
}
10 changes: 9 additions & 1 deletion pkg/sources/github/github.go
@@ -472,7 +472,15 @@ func (s *Source) enumerate(ctx context.Context, apiEndpoint string) (*github.Cli
return nil, errors.Errorf("Invalid configuration given for source. Name: %s, Type: %s", s.name, s.Type())
}

s.repos = s.filteredRepoCache.Values()
s.repos = make([]string, 0, s.filteredRepoCache.Count())
for _, repo := range s.filteredRepoCache.Values() {
r, ok := repo.(string)
if !ok {
ctx.Logger().Error(fmt.Errorf("type assertion failed"), "unexpected value in cache", "repo", repo)
continue
}
s.repos = append(s.repos, r)
}
githubReposEnumerated.WithLabelValues(s.name).Set(float64(len(s.repos)))
s.log.Info("Completed enumeration", "num_repos", len(s.repos), "num_orgs", s.orgsCache.Count(), "num_members", len(s.memberCache))
