-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathwalker.go
357 lines (320 loc) · 10.3 KB
/
walker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fswalker
import (
"context"
"fmt"
"log"
"os"
"path/filepath"
"strings"
"sync"
"github.com/google/uuid"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/google/fswalker/internal/fsstat"
"github.com/google/fswalker/internal/metrics"
fspb "github.com/google/fswalker/proto/fswalker"
)
// parallelism is the number of root paths to walk in parallel, i.e. the number
// of worker goroutines started by Run.
const parallelism = 1

// Versions for compatibility comparison. They are stamped onto every File and
// Walk message produced by the Walker.
const (
	fileVersion = 1
	walkVersion = 1
)

// Unique names for each counter - used by the counter output processor.
const (
	countFiles       = "file-count"
	countDirectories = "dir-count"
	countFileSizeSum = "file-size-sum"
	countStatErr     = "file-stat-errors"
	countHashes      = "file-hash-count"
)
// WalkCallback is called by Walker at the end of the Run, after all include paths
// have been walked without error. It receives the context given to Run and the
// collected Walk.
// The callback is typically used to dump the walk to disk and/or perform any other checks.
// The error return value is propagated back to the Run callers.
type WalkCallback func(context.Context, *fspb.Walk) error
// WalkerFromPolicyFile creates a new Walker whose policy is loaded from path
// via readTextProto. The returned Walker has a fresh metrics Counter attached.
// Any error from loading the policy is returned unchanged.
func WalkerFromPolicyFile(ctx context.Context, path string) (*Walker, error) {
	policy := new(fspb.Policy)
	err := readTextProto(ctx, path, policy)
	if err != nil {
		return nil, err
	}

	walker := &Walker{pol: policy}
	walker.Counter = new(metrics.Counter)
	return walker, nil
}
// Walker is able to walk a file structure starting with a list of given includes
// as roots. All paths starting with any prefix specified in the excludes are
// ignored. The list of specific files in the hash list are read and a hash sum
// built for each. Note that this is expensive and should not be done for large
// files or a large number of files.
type Walker struct {
	// pol is the configuration defining which paths to include and exclude from the walk.
	pol *fspb.Policy

	// walk collects all processed files during a run.
	// It is replaced with a fresh Walk at the start of every Run.
	walk *fspb.Walk
	// walkMu guards appends to walk's File and Notification lists.
	walkMu sync.Mutex

	// Function to call once the Walk is complete i.e. to inspect or write the Walk.
	// It may be nil, in which case Run simply returns after walking.
	WalkCallback WalkCallback

	// Verbose, when true, makes Walker print file metadata to stdout.
	// It also enables INFO notifications for skipped paths.
	Verbose bool

	// Counter records stats over all processed files, if non-nil.
	Counter *metrics.Counter
}
// convert builds a File message for path. The path is cleaned first. When info
// is nil only the version and path are filled in. Otherwise the file metadata
// and stat details are attached, and a SHA-256 fingerprint is embedded for
// non-directories that match a hash prefix and do not exceed the policy's
// maximum hash file size. A hashing failure is only logged; a failure to
// obtain the stat details is returned as an error.
func (w *Walker) convert(path string, info os.FileInfo) (*fspb.File, error) {
	cleaned := filepath.Clean(path)
	file := &fspb.File{Version: fileVersion, Path: cleaned}
	if info == nil {
		return file, nil
	}

	isDir := info.IsDir()

	// Only build the hash sum if requested and if it is not a directory.
	if w.wantHashing(cleaned) && !isDir && info.Size() <= w.pol.MaxHashFileSize {
		if sum, hashErr := sha256sum(cleaned); hashErr != nil {
			log.Printf("unable to build hash for %s: %s", cleaned, hashErr)
		} else {
			file.Fingerprint = append(file.Fingerprint, &fspb.Fingerprint{
				Method: fspb.Fingerprint_SHA256,
				Value:  sum,
			})
		}
	}

	file.Info = &fspb.FileInfo{
		Name:     info.Name(),
		Size:     info.Size(),
		Mode:     uint32(info.Mode()),
		Modified: timestamppb.New(info.ModTime()),
		IsDir:    isDir,
	}

	stat, err := fsstat.ToStat(info)
	if err != nil {
		return nil, err
	}
	file.Stat = stat
	return file, nil
}
// wantHashing reports whether path starts with any of the hash prefixes listed
// in the policy, i.e. whether the file was asked to be hashed.
func (w *Walker) wantHashing(path string) bool {
	prefixes := w.pol.HashPfx
	for i := range prefixes {
		if strings.HasPrefix(path, prefixes[i]) {
			return true
		}
	}
	return false
}
// isExcluded reports whether path starts with any of the exclude prefixes
// listed in the policy, i.e. whether it was asked to be left out of the scan.
func (w *Walker) isExcluded(path string) bool {
	excluded := false
	for _, prefix := range w.pol.ExcludePfx {
		if excluded = strings.HasPrefix(path, prefix); excluded {
			break
		}
	}
	return excluded
}
// process runs output functions for the given input File: it optionally prints
// a short summary to stdout (verbose mode), records the file in the current
// walk and updates the metrics counters when a Counter is configured.
//
// All message fields are read through the generated nil-safe getters, so a File
// without Info or Stat is reported with zero values rather than causing a nil
// pointer dereference. The metrics code below already treats a missing Stat as
// a countable condition, and the verbose output now tolerates it as well.
//
// process currently always returns nil; ctx is accepted for interface symmetry
// and is not used.
func (w *Walker) process(ctx context.Context, f *fspb.File) error {
	info, stat := f.GetInfo(), f.GetStat()

	// Print a short overview if we're running in verbose mode.
	if w.Verbose {
		fmt.Println(NormalizePath(f.GetPath(), info.GetIsDir()))
		ts := proto.Clone(info.GetModified())
		details := []string{
			fmt.Sprintf("size(%d)", info.GetSize()),
			fmt.Sprintf("mode(%v)", os.FileMode(info.GetMode())),
			fmt.Sprintf("mTime(%v)", ts),
			fmt.Sprintf("uid(%d)", stat.GetUid()),
			fmt.Sprintf("gid(%d)", stat.GetGid()),
			fmt.Sprintf("inode(%d)", stat.GetInode()),
		}
		for _, fp := range f.GetFingerprint() {
			details = append(details, fmt.Sprintf("%s(%s)", fspb.Fingerprint_Method_name[int32(fp.GetMethod())], fp.GetValue()))
		}
		fmt.Println(strings.Join(details, ", "))
	}

	// Add file to the walk which will later be written out to disk.
	w.addFileToWalk(f)

	// Collect some metrics.
	if w.Counter != nil {
		if info.GetIsDir() {
			w.Counter.Add(1, countDirectories)
		} else {
			w.Counter.Add(1, countFiles)
		}
		w.Counter.Add(info.GetSize(), countFileSizeSum)
		if stat == nil {
			w.Counter.Add(1, countStatErr)
		}
		if len(f.GetFingerprint()) > 0 {
			w.Counter.Add(1, countHashes)
		}
	}

	return nil
}
// addFileToWalk appends f to the file list of the current walk while holding
// walkMu, so it can be called from several workers at once.
func (w *Walker) addFileToWalk(f *fspb.File) {
	w.walkMu.Lock()
	defer w.walkMu.Unlock()

	w.walk.File = append(w.walk.File, f)
}
// addNotificationToWalk records a notification with severity s about path in
// the current walk. The append happens while holding walkMu.
func (w *Walker) addNotificationToWalk(s fspb.Notification_Severity, path, msg string) {
	note := &fspb.Notification{Severity: s, Path: path, Message: msg}

	w.walkMu.Lock()
	defer w.walkMu.Unlock()

	w.walk.Notification = append(w.walk.Notification, note)
}
// relDirDepth calculates the path depth relative to the origin, measured as the
// number of path separators in path beyond those in origin. Both arguments are
// compared purely textually.
//
// If path contains fewer separators than origin the depth is reported as 0
// instead of wrapping around to a huge unsigned value.
func (w *Walker) relDirDepth(origin, path string) uint32 {
	sep := string(filepath.Separator)
	// Counting separators yields the same difference as comparing the number
	// of split segments, without allocating two slices per call.
	depth := strings.Count(path, sep) - strings.Count(origin, sep)
	if depth < 0 {
		return 0
	}
	return uint32(depth)
}
// worker is a worker routine that reads root paths from chPaths and walks all the files
// and subdirectories below them until the channel is exhausted. All discovered files are
// converted to File and processed with w.process().
//
// Problems with individual entries (a directory that cannot be read, an entry whose
// FileInfo cannot be fetched) are logged and skipped so the walk can continue. worker
// stops and returns a non-nil error when a root path cannot be stat'ed, when converting
// or processing an entry fails, or when ctx is cancelled. Returned errors wrap their
// cause so callers can inspect it with errors.Is / errors.As.
func (w *Walker) worker(ctx context.Context, chPaths <-chan string) error {
	// skipEntry returns the WalkDir result that skips the current entry: SkipDir prunes
	// a directory together with its subtree, whereas for a file nil is returned because
	// returning SkipDir on a file would skip the rest of the files in the dir.
	skipEntry := func(info os.FileInfo) error {
		if info.IsDir() {
			return filepath.SkipDir
		}
		return nil
	}

	for path := range chPaths {
		baseInfo, err := os.Stat(path)
		if err != nil {
			return fmt.Errorf("unable to get file info for base path %q: %w", path, err)
		}
		baseDev, err := fsstat.DevNumber(baseInfo)
		if err != nil {
			return fmt.Errorf("unable to get file stat on base path %q: %w", path, err)
		}

		err = filepath.WalkDir(path, func(p string, d os.DirEntry, err error) error {
			// Stop promptly once the caller has cancelled the walk.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}

			// If the initial stat of the root dir fails we can get a nil value for d along with the
			// PathError from os.Stat. See WalkDirFunc for details.
			if d == nil && err != nil {
				msg := fmt.Sprintf("failed to stat root dir %q: %s", p, err)
				log.Print(msg)
				w.addNotificationToWalk(fspb.Notification_WARNING, p, msg)
				return err
			}

			// This catches the other error state of d != nil and err != nil, indicating
			// there was an error with the directory's ReadDir call.
			if err != nil {
				msg := fmt.Sprintf("failed to walk %q: %s", p, err)
				log.Print(msg)
				w.addNotificationToWalk(fspb.Notification_WARNING, p, msg)
				return nil // returning SkipDir on a file would skip the rest of the files in the dir
			}

			info, err := d.Info()
			if err != nil {
				msg := fmt.Sprintf("failed to get FileInfo for %q: %s", d.Name(), err)
				log.Print(msg)
				return nil
			}

			p = NormalizePath(p, info.IsDir())

			// Checking various exclusions based on flags in the walker policy.
			if w.isExcluded(p) {
				if w.Verbose {
					w.addNotificationToWalk(fspb.Notification_INFO, p, fmt.Sprintf("skipping %q: excluded", p))
				}
				return skipEntry(info)
			}

			if w.pol.IgnoreIrregularFiles && !info.Mode().IsRegular() && !info.IsDir() {
				if w.Verbose {
					w.addNotificationToWalk(fspb.Notification_INFO, p, fmt.Sprintf("skipping %q: irregular file (mode: %s)", p, info.Mode()))
				}
				return nil
			}

			f, err := w.convert(p, info)
			if err != nil {
				return err
			}

			if w.pol.MaxDirectoryDepth > 0 && info.IsDir() && w.relDirDepth(path, p) > w.pol.MaxDirectoryDepth {
				w.addNotificationToWalk(fspb.Notification_WARNING, p, fmt.Sprintf("skipping %q: more than %d into base path %q", p, w.pol.MaxDirectoryDepth, path))
				return filepath.SkipDir
			}

			if !w.pol.WalkCrossDevice && f.Stat != nil && baseDev != f.Stat.Dev {
				msg := fmt.Sprintf("skipping %q: file is on different device", p)
				// msg contains a file path and must not be used as a format string:
				// a '%' in the path would otherwise be interpreted as a verb.
				log.Print(msg)
				if w.Verbose {
					w.addNotificationToWalk(fspb.Notification_INFO, p, msg)
				}
				return skipEntry(info)
			}

			return w.process(ctx, f)
		})
		if err != nil {
			return fmt.Errorf("error walking root include path %q: %w", path, err)
		}
	}
	return nil
}
// Run is the main function of Walker. It discovers all files under included paths
// (minus excluded ones) and processes them.
// This does NOT follow symlinks - fortunately we don't need it either.
//
// Every Run starts a fresh Walk stamped with a new ID, the local hostname, the
// policy and the start time. Include paths are cleaned and de-duplicated before
// being handed to the workers. If any worker reports an error, Run returns a
// combined error and the WalkCallback is not invoked. Otherwise the stop time is
// recorded and the result of WalkCallback (if set) is returned.
func (w *Walker) Run(ctx context.Context) error {
	walkID := uuid.New().String()
	hn, err := os.Hostname()
	if err != nil {
		return err
	}
	w.walk = &fspb.Walk{
		Version:   walkVersion,
		Id:        walkID,
		Policy:    w.pol,
		Hostname:  hn,
		StartWalk: timestamppb.Now(),
	}

	// Collect the unique, cleaned root paths in policy order.
	seen := make(map[string]bool, len(w.pol.Include))
	roots := make([]string, 0, len(w.pol.Include))
	for _, p := range w.pol.Include {
		p = filepath.Clean(p)
		if seen[p] {
			continue
		}
		seen[p] = true
		roots = append(roots, p)
	}

	// The channel is sized to hold every root so that queueing can never block.
	// A worker stops draining the channel as soon as it hits an error; with a
	// smaller buffer the sends below would hang forever once all workers have
	// exited while roots are still waiting to be queued.
	chPaths := make(chan string, len(roots))
	for _, p := range roots {
		chPaths <- p
	}
	close(chPaths)

	var wg sync.WaitGroup
	var errs []string
	var errsMu sync.Mutex
	for i := 0; i < parallelism; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if err := w.worker(ctx, chPaths); err != nil {
				errsMu.Lock()
				errs = append(errs, err.Error())
				errsMu.Unlock()
			}
		}()
	}
	wg.Wait()

	if len(errs) != 0 {
		return fmt.Errorf("unable to complete Walk:\n%s", strings.Join(errs, "\n"))
	}

	// Finishing work by writing out the report.
	w.walk.StopWalk = timestamppb.Now()
	if w.WalkCallback == nil {
		return nil
	}
	return w.WalkCallback(ctx, w.walk)
}