-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[feat] - tmp file diffs #2306
[feat] - tmp file diffs #2306
Changes from 23 commits
f2f13b2
7d2452a
0802a62
0365023
57e5d6a
dad92c5
fd41709
da2ff88
b3260ff
73903e7
994c268
f8e87e9
bba85f4
2e7479e
8cc4246
f114a4a
bf25bd2
390a5a5
bb82471
9af37d8
bab3f04
f26ac45
cbd4ddf
7aef649
47544ad
3c7a1d5
a9ee71a
419107e
1bd8b27
c259b84
1485c9a
73a766f
1699097
3ac95f5
e442142
bff34a4
fd5181e
1c577db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,6 @@ import ( | |
"bufio" | ||
"bytes" | ||
"fmt" | ||
"github.com/go-logr/logr" | ||
"io" | ||
"os" | ||
"os/exec" | ||
|
@@ -13,6 +12,8 @@ import ( | |
"strings" | ||
"time" | ||
|
||
"github.com/go-logr/logr" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/common" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/context" | ||
) | ||
|
@@ -22,12 +23,96 @@ const ( | |
defaultDateFormat = "Mon Jan 02 15:04:05 2006 -0700" | ||
|
||
// defaultMaxDiffSize is the maximum size for a diff. Larger diffs will be cut off. | ||
defaultMaxDiffSize = 1 * 1024 * 1024 * 1024 // 1GB | ||
defaultMaxDiffSize = 2 * 1024 * 1024 * 1024 // 1GB | ||
|
||
// defaultMaxCommitSize is the maximum size for a commit. Larger commits will be cut off. | ||
defaultMaxCommitSize = 1 * 1024 * 1024 * 1024 // 1GB | ||
defaultMaxCommitSize = 2 * 1024 * 1024 * 1024 // 1GB | ||
ahrav marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
|
||
// contentWriter defines a common interface for writing, reading, and managing diff content. | ||
// It abstracts the underlying storage mechanism, allowing flexibility in how content is handled. | ||
// This interface enables the use of different content storage strategies (e.g., in-memory buffer, file-based storage) | ||
// based on performance needs or resource constraints, providing a unified way to interact with different content types. | ||
type contentWriter interface { // Write appends data to the content storage. | ||
// Write appends data to the content storage. | ||
Write(ctx context.Context, data []byte) (int, error) | ||
// ReadCloser provides a reader for accessing stored content. | ||
ReadCloser() (io.ReadCloser, error) | ||
rosecodym marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Close finalizes the content storage, performing cleanup if necessary. | ||
Close() error | ||
// Len returns the current size of the content. | ||
Len() int | ||
// String returns the content as a string. | ||
String(ctx context.Context) string | ||
ahrav marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// buffer is a wrapper around bytes.Buffer, implementing the contentWriter interface. | ||
// This allows bytes.Buffer to be used wherever a contentWriter is required, ensuring compatibility | ||
// with the contentWriter interface while leveraging the existing implementation of bytes.Buffer. | ||
type buffer struct{ bytes.Buffer } | ||
|
||
// Write delegates the writing operation to the underlying bytes.Buffer, ignoring the context. | ||
// The context is included to satisfy the contentWriter interface, allowing for future extensions | ||
// where context handling might be necessary (e.g., for timeouts or cancellation). | ||
func (b *buffer) Write(_ context.Context, data []byte) (int, error) { return b.Buffer.Write(data) } | ||
|
||
// ReadCloser provides a read-closer for the buffer's content. | ||
// It wraps the buffer's content in a NopCloser to provide a ReadCloser without additional closing behavior, | ||
// as closing a bytes.Buffer is a no-op. | ||
func (b *buffer) ReadCloser() (io.ReadCloser, error) { | ||
return io.NopCloser(bytes.NewReader(b.Bytes())), nil | ||
} | ||
|
||
// Close is a no-op for buffer, as there is no resource cleanup needed for bytes.Buffer. | ||
func (b *buffer) Close() error { return nil } | ||
|
||
// String returns the buffer's content as a string. | ||
func (b *buffer) String(_ context.Context) string { return b.Buffer.String() } | ||
|
||
// Diff contains the information about a file diff in a commit. | ||
// It abstracts the underlying content representation, allowing for flexible handling of diff content. | ||
// The use of contentWriter enables the management of diff data either in memory or on disk, | ||
// based on its size, optimizing resource usage and performance. | ||
type Diff struct { | ||
PathB string | ||
LineStart int | ||
contentWriter contentWriter | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the only line that changed in this struct + the comment. Moved it so it's grouped closer to its constructor and methods. |
||
IsBinary bool | ||
} | ||
|
||
type diffOption func(*Diff) | ||
|
||
// withPathB sets the PathB option. | ||
func withPathB(pathB string) diffOption { return func(d *Diff) { d.PathB = pathB } } | ||
|
||
// NewDiff creates a new Diff with a threshold. | ||
func NewDiff(opts ...diffOption) *Diff { | ||
diff := new(Diff) | ||
diff.contentWriter = new(buffer) | ||
for _, opt := range opts { | ||
opt(diff) | ||
} | ||
|
||
return diff | ||
} | ||
|
||
// Len returns the length of the storage. | ||
func (d *Diff) Len() int { return d.contentWriter.Len() } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These methods probably can be removed, but I was trying to avoid reaching into the contentWriter directly. This is also exported since we need access to this in the git source. Open to changing it if we feel these are providing any additional encapsulation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want to guard them behind the state like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like that idea. Will guard them. |
||
|
||
// ReadCloser returns a ReadCloser for the contentWriter. | ||
func (d *Diff) ReadCloser() (io.ReadCloser, error) { return d.contentWriter.ReadCloser() } | ||
|
||
// write delegates to the contentWriter. | ||
func (d *Diff) write(ctx context.Context, p []byte) error { | ||
_, err := d.contentWriter.Write(ctx, p) | ||
return err | ||
} | ||
|
||
// finalize ensures proper closure of resources associated with the Diff. | ||
// handle the final flush in the finalize method, in case there's data remaining in the buffer. | ||
// This method should be called to release resources, especially when writing to a file. | ||
func (d *Diff) finalize() error { return d.contentWriter.Close() } | ||
|
||
// Commit contains commit header info and diffs. | ||
type Commit struct { | ||
Hash string | ||
|
@@ -38,19 +123,44 @@ type Commit struct { | |
Size int // in bytes | ||
} | ||
|
||
// Diff contains the info about a file diff in a commit. | ||
type Diff struct { | ||
PathB string | ||
LineStart int | ||
Content bytes.Buffer | ||
IsBinary bool | ||
// Equal compares the content of two Commits to determine if they are the same. | ||
func (c1 *Commit) Equal(ctx context.Context, c2 *Commit) bool { | ||
switch { | ||
case c1.Hash != c2.Hash: | ||
return false | ||
case c1.Author != c2.Author: | ||
return false | ||
case !c1.Date.Equal(c2.Date): | ||
return false | ||
case c1.Message.String() != c2.Message.String(): | ||
return false | ||
case len(c1.Diffs) != len(c2.Diffs): | ||
return false | ||
} | ||
|
||
for i := range c1.Diffs { | ||
d1 := c1.Diffs[i] | ||
d2 := c2.Diffs[i] | ||
switch { | ||
case d1.PathB != d2.PathB: | ||
return false | ||
case d1.LineStart != d2.LineStart: | ||
return false | ||
case d1.contentWriter.String(ctx) != d2.contentWriter.String(ctx): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly, this is the only line that changed in the |
||
return false | ||
case d1.IsBinary != d2.IsBinary: | ||
return false | ||
} | ||
} | ||
return true | ||
} | ||
|
||
// Parser sets values used in GitParse. | ||
type Parser struct { | ||
maxDiffSize int | ||
maxCommitSize int | ||
dateFormat string | ||
contentWriter contentWriter | ||
} | ||
|
||
type ParseState int | ||
|
@@ -123,45 +233,14 @@ func NewParser(options ...Option) *Parser { | |
dateFormat: defaultDateFormat, | ||
maxDiffSize: defaultMaxDiffSize, | ||
maxCommitSize: defaultMaxCommitSize, | ||
contentWriter: new(buffer), | ||
} | ||
for _, option := range options { | ||
option(parser) | ||
} | ||
return parser | ||
} | ||
|
||
// Equal compares the content of two Commits to determine if they are the same. | ||
func (c1 *Commit) Equal(c2 *Commit) bool { | ||
switch { | ||
case c1.Hash != c2.Hash: | ||
return false | ||
case c1.Author != c2.Author: | ||
return false | ||
case !c1.Date.Equal(c2.Date): | ||
return false | ||
case c1.Message.String() != c2.Message.String(): | ||
return false | ||
case len(c1.Diffs) != len(c2.Diffs): | ||
return false | ||
} | ||
for i := range c1.Diffs { | ||
d1 := c1.Diffs[i] | ||
d2 := c2.Diffs[i] | ||
switch { | ||
case d1.PathB != d2.PathB: | ||
return false | ||
case d1.LineStart != d2.LineStart: | ||
return false | ||
case d1.Content.String() != d2.Content.String(): | ||
return false | ||
case d1.IsBinary != d2.IsBinary: | ||
return false | ||
} | ||
} | ||
return true | ||
|
||
} | ||
|
||
// RepoPath parses the output of the `git log` command for the `source` path. | ||
func (c *Parser) RepoPath(ctx context.Context, source string, head string, abbreviatedLog bool, excludedGlobs []string, isBare bool) (chan Commit, error) { | ||
args := []string{"-C", source, "log", "-p", "--full-history", "--date=format:%a %b %d %H:%M:%S %Y %z"} | ||
|
@@ -254,11 +333,11 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
outReader := bufio.NewReader(stdOut) | ||
var ( | ||
currentCommit *Commit | ||
currentDiff Diff | ||
|
||
totalLogSize int | ||
) | ||
var latestState = Initial | ||
currentDiff := NewDiff() | ||
|
||
defer common.RecoverWithExit(ctx) | ||
defer close(commitChan) | ||
|
@@ -277,20 +356,28 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
latestState = CommitLine | ||
|
||
// If there is a currentDiff, add it to currentCommit. | ||
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) | ||
currentCommit.Size += currentDiff.Content.Len() | ||
if currentDiff.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) | ||
currentCommit.Size += currentDiff.Len() | ||
} | ||
// If there is a currentCommit, send it to the channel. | ||
if currentCommit != nil { | ||
if err := currentDiff.finalize(); err != nil { | ||
ctx.Logger().Error( | ||
err, | ||
"failed to finalize diff", | ||
"commit", currentCommit.Hash, | ||
"diff", currentDiff.PathB, | ||
"size", currentDiff.Len(), | ||
"latest_state", latestState.String(), | ||
) | ||
} | ||
commitChan <- *currentCommit | ||
totalLogSize += currentCommit.Size | ||
} | ||
// Create a new currentDiff and currentCommit | ||
currentDiff = Diff{} | ||
currentCommit = &Commit{ | ||
Message: strings.Builder{}, | ||
} | ||
currentDiff = NewDiff() | ||
currentCommit = &Commit{Message: strings.Builder{}} | ||
// Check that the commit line contains a hash and set it. | ||
if len(line) >= 47 { | ||
currentCommit.Hash = string(line[7:47]) | ||
|
@@ -326,12 +413,15 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
if currentCommit == nil { | ||
currentCommit = &Commit{} | ||
} | ||
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) | ||
if currentDiff.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) | ||
if err := currentDiff.finalize(); err != nil { | ||
ctx.Logger().Error(err, "failed to finalize diff") | ||
ahrav marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
// If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits. | ||
totalSize := 0 | ||
for _, diff := range currentCommit.Diffs { | ||
totalSize += diff.Content.Len() | ||
totalSize += diff.Len() | ||
} | ||
if totalSize > c.maxCommitSize { | ||
oldCommit := currentCommit | ||
|
@@ -348,7 +438,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
currentCommit.Message.WriteString(oldCommit.Message.String()) | ||
} | ||
} | ||
currentDiff = Diff{} | ||
currentDiff = NewDiff() | ||
case isModeLine(isStaged, latestState, line): | ||
latestState = ModeLine | ||
// NoOp | ||
|
@@ -375,12 +465,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
case isHunkLineNumberLine(isStaged, latestState, line): | ||
latestState = HunkLineNumberLine | ||
|
||
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) | ||
} | ||
currentDiff = Diff{ | ||
PathB: currentDiff.PathB, | ||
if currentDiff.Len() > 0 || currentDiff.IsBinary { | ||
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) | ||
} | ||
currentDiff = NewDiff(withPathB(currentDiff.PathB)) | ||
|
||
words := bytes.Split(line, []byte(" ")) | ||
if len(words) >= 3 { | ||
|
@@ -395,24 +483,21 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
latestState = HunkContentLine | ||
} | ||
// TODO: Why do we care about this? It creates empty lines in the diff. If there are no plusLines, it's just newlines. | ||
currentDiff.Content.Write([]byte("\n")) | ||
if err := currentDiff.write(ctx, []byte("\n")); err != nil { | ||
ctx.Logger().Error(err, "failed to write to diff") | ||
} | ||
case isHunkPlusLine(isStaged, latestState, line): | ||
if latestState != HunkContentLine { | ||
latestState = HunkContentLine | ||
} | ||
|
||
currentDiff.Content.Write(line[1:]) | ||
case isHunkMinusLine(isStaged, latestState, line): | ||
if latestState != HunkContentLine { | ||
latestState = HunkContentLine | ||
if err := currentDiff.write(ctx, line[1:]); err != nil { | ||
ctx.Logger().Error(err, "failed to write to diff") | ||
} | ||
// NoOp. We only care about additions. | ||
case isHunkNewlineWarningLine(isStaged, latestState, line): | ||
if latestState != HunkContentLine { | ||
latestState = HunkContentLine | ||
} | ||
// NoOp | ||
case isHunkEmptyLine(isStaged, latestState, line): | ||
case isHunkMinusLine(isStaged, latestState, line), | ||
isHunkNewlineWarningLine(isStaged, latestState, line), | ||
isHunkEmptyLine(isStaged, latestState, line): | ||
if latestState != HunkContentLine { | ||
latestState = HunkContentLine | ||
} | ||
|
@@ -439,14 +524,14 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch | |
latestState = ParseFailure | ||
} | ||
|
||
if currentDiff.Content.Len() > c.maxDiffSize { | ||
if currentDiff.Len() > c.maxDiffSize { | ||
ctx.Logger().V(2).Info(fmt.Sprintf( | ||
"Diff for %s exceeded MaxDiffSize(%d)", currentDiff.PathB, c.maxDiffSize, | ||
)) | ||
break | ||
} | ||
} | ||
cleanupParse(currentCommit, ¤tDiff, commitChan, &totalLogSize) | ||
cleanupParse(currentCommit, currentDiff, commitChan, &totalLogSize) | ||
|
||
ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize) | ||
} | ||
|
@@ -718,7 +803,7 @@ func isCommitSeparatorLine(isStaged bool, latestState ParseState, line []byte) b | |
|
||
func cleanupParse(currentCommit *Commit, currentDiff *Diff, commitChan chan Commit, totalLogSize *int) { | ||
// Ignore empty or binary diffs (this condition may be redundant). | ||
if currentDiff != nil && (currentDiff.Content.Len() > 0 || currentDiff.IsBinary) { | ||
if currentDiff != nil && (currentDiff.Len() > 0 || currentDiff.IsBinary) { | ||
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) | ||
} | ||
if currentCommit != nil { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
increased these since we now write to disk and shouldn't run into issues with large diffs or commits.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: The comment should be updated too