-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfind_dup.py
113 lines (105 loc) · 3.66 KB
/
find_dup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
from sys import argv, exit
from hashlib import sha256 # collision-resistant and safer than md5
# Help text printed by main() when the script is started without any folder argument.
# NOTE(review): the text names find_duplicate_files.py, while this file appears to be
# called find_dup.py -- confirm which script name should be shown to the user.
usage = 'Usage: >> python find_duplicate_files.py folder OR >> python find_duplicate_files.py folder1 folder2 folder3'
def hash_file(path, blocksize = 65536):
    '''
    Return the SHA-256 hex digest of the file located at `path`.

    The file is opened in binary mode and fed to the hasher in chunks of
    `blocksize` bytes (2**16 by default) instead of being read in one go.
    '''
    digest = sha256()
    with open(path, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b'',
        # which is how a binary file signals end-of-file.
        for chunk in iter(lambda: stream.read(blocksize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def find_dup_size(folder):
    '''
    Group every file found under `folder` by its size in bytes.

    The directory tree is walked recursively with os.walk, and each
    directory is announced on stdout while it is being scanned.

    Returns a dict in the format {filesize: [filepath, ...]}. Sizes that
    occur only once are included as well; filtering is left to the caller.

    Files whose size cannot be read (dangling symlinks, files removed
    during the scan, permission errors) are reported on stdout and skipped
    instead of aborting the whole scan.
    '''
    dups = {}
    for dirName, subdirs, fileList in os.walk(folder):
        print('Scanning %s ...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            try:
                file_size = os.path.getsize(path)
            except OSError as err:
                # One unreadable entry must not stop the scan of the rest.
                print('Skipping %s (%s)' % (path, err))
                continue
            # Add or append the file path
            dups.setdefault(file_size, []).append(path)
    return dups
def find_dup_hash(file_list):
    '''
    Group the files in `file_list` by the digest returned by hash_file.

    The list of files being compared is printed to stdout first.

    Returns a dict in the format {hexdigest: [filepath, ...]}. Digests
    that occur only once are included as well.

    A file that cannot be opened or read (OSError) is reported on stdout
    and left out of the result, so a single unreadable file does not
    abort the comparison of the remaining ones.
    '''
    print('Comparing: ')
    for filename in file_list:
        print('    {}'.format(filename))
    dups = {}
    for path in file_list:
        try:
            file_hash = hash_file(path)
        except OSError as err:
            print('Skipping %s (%s)' % (path, err))
            continue
        dups.setdefault(file_hash, []).append(path)
    return dups
def join_dicts(dict1, dict2):
    '''
    Merge `dict2` into `dict1` in place.

    Both dictionaries are expected to map a key to a list. For a key
    present in both, the list from `dict2` is appended after the one
    already in `dict1`; a key only present in `dict2` is added.

    `dict1` always receives a fresh list, so it never shares a list
    object with `dict2` and `dict2` is left untouched. Returns None.
    '''
    for key, values in dict2.items():
        # Building a new list avoids aliasing dict2's list inside dict1.
        dict1[key] = dict1.get(key, []) + list(values)
def write_results(dict1, folders, output_path='duplicates.txt'):
    '''
    Write the duplicate report to a text file.

    dict1       -- dict whose values are lists of file paths; only lists
                   holding more than one path are reported as duplicates.
    folders     -- the scanned folders, written in the report header.
    output_path -- file to (over)write, encoded as UTF-8. Defaults to
                   'duplicates.txt' in the current working directory.

    A short status message is also printed to stdout. Returns None.
    '''
    results = [paths for paths in dict1.values() if len(paths) > 1]
    separator = '----' * 15 + '\n'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('Folders scanned: ' + str(folders) + '\n')
        if not results:
            print('No duplicate file found')
            f.write('No duplicate file found\n')
            return
        f.write('Duplicates Found:\n')
        f.write('The following files are identical. The name could differ, but the content is identical\n')
        f.write(separator)
        # One group of identical files per section, closed by a separator.
        for result in results:
            for subresult in result:
                f.write('\t\t%s\n' % subresult)
            f.write(separator)
    # Announce the report only once the file has been completely written.
    print('Please, see the %s file in your computer!' % output_path)
def main(argv):
    '''
    Find duplicated files in the folders given on the command line.

    argv -- argument list in sys.argv form: argv[0] is the program name
            and every following item is a folder to scan.

    Files are first grouped by size, then files sharing a size are
    compared by hash, and the outcome is handed to write_results.

    Exits with status 2 (after printing the usage text) when no folder
    is given, and with status 1 when one of the arguments is not an
    existing directory.
    '''
    if len(argv) <= 1:
        print(usage)
        exit(2)

    folders = argv[1:]
    # Validate everything up front so a bad argument fails before any
    # (possibly long) scan starts. isdir() also rejects plain files,
    # which os.walk would otherwise silently ignore.
    for folder in folders:
        if not os.path.isdir(folder):
            print('%s is not a valid path, please verify' % folder)
            exit(1)

    dup_size = {}
    for folder in folders:  # iterate the folders given
        # find the candidate files by size and append them to dup_size
        join_dicts(dup_size, find_dup_size(folder))

    print('Comparing files with the same size... ')
    dups = {}
    for dup_list in dup_size.values():
        # Overlapping folders (e.g. "a" and "a/b") yield the same file more
        # than once; keep one entry per absolute path so a file is never
        # reported as a duplicate of itself.
        seen = {}
        for path in dup_list:
            seen.setdefault(os.path.abspath(path), path)
        unique_paths = list(seen.values())
        if len(unique_paths) > 1:
            # find duplicated by hash among the files sharing a size
            join_dicts(dups, find_dup_hash(unique_paths))
    write_results(dups, folders)
# Run the duplicate search only when this file is executed as a script;
# argv is sys.argv, imported at the top of the file.
if __name__ == '__main__':
    main(argv)