This repository has been archived by the owner on Feb 12, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfileListMetadataReconcile.py
110 lines (96 loc) · 3.49 KB
/
fileListMetadataReconcile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
import csv
import time
import os
import argparse
# Command-line interface. All three options are optional; any value that is
# missing (or empty) on the command line is requested interactively instead.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-d', '--directory',
    help='the directory of the files. optional - if not provided, the script will ask for input',
)
parser.add_argument(
    '-f', '--fileNameCSV',
    help='the metadata CSV file. optional - if not provided, the script will ask for input',
)
parser.add_argument(
    '-e', '--fileExtension',
    help='the file extension. optional - if not provided, the script will ask for input',
)
args = parser.parse_args()

# `or` only evaluates input() when the option is falsy, so the prompts appear
# in the same order as before: directory, metadata CSV, file extension.
directory = args.directory or input('Enter directory (C:/Test/): ')
fileNameCSV = args.fileNameCSV or input('Enter metadata CSV file: ')
fileExtension = args.fileExtension or input('Enter file extension: ')
# Start the clock. startTime is read again at the end of the script to report
# the total run time.
startTime = time.time()

# Walk the directory tree and collect one identifier for every file whose
# name ends with the requested extension.
extensionSuffix = '.' + fileExtension
fileIdentifierList = []
for root, dirs, files in os.walk(directory, topdown=True):
    for file in files:
        if file.endswith(fileExtension):
            # Strings are immutable: the previous bare
            # file.replace('.' + fileExtension, '') call threw its result
            # away, so the extension was never removed. Strip only the
            # trailing '.<extension>' so an identical sequence earlier in the
            # name is left untouched.
            # NOTE(review): identifiers written to the output CSVs no longer
            # carry the extension - confirm downstream consumers expect that.
            if file.endswith(extensionSuffix):
                file = file[:-len(extensionSuffix)]
            fileIdentifierList.append(file)

# Report how long the directory walk took as hours:minutes:seconds.
elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('File list creation time: ','%d:%02d:%02d' % (h, m, s))
# Write every collected file identifier to collectionfileList.csv, one per
# row under a 'fileName' header.
# The with-block closes the file so all rows are flushed to disk (the handle
# was previously never closed), and newline='' lets the csv module control
# line endings, which avoids blank rows between records on Windows.
with open('collectionfileList.csv', 'w', newline='') as collectionFile:
    f = csv.writer(collectionFile)
    f.writerow(['fileName'])
    f.writerows([file] for file in fileIdentifierList)
# Read the 'fileIdentifier' column of the metadata CSV into
# metadataIdentifierList and copy each value to metadataFileList.csv under a
# 'metadataItemID' header. A CSV without a 'fileIdentifier' column raises
# KeyError on the first record.
metadataIdentifierList = []
# Both files are managed by the with-block so the output is closed and
# flushed (it was previously left open); newline='' is what the csv module
# expects for the files it reads and writes.
with open(fileNameCSV, newline='') as csvfile, \
        open('metadataFileList.csv', 'w', newline='') as metadataListFile:
    reader = csv.DictReader(csvfile)
    f = csv.writer(metadataListFile)
    f.writerow(['metadataItemID'])
    for row in reader:
        value = row['fileIdentifier']
        f.writerow([value])
        metadataIdentifierList.append(value)
# File identifiers that start with at least one metadata identifier.
# A file appears once for every metadata identifier it matches; the list is
# only consumed through set() further down, so the repeats are harmless.
fileMatches = [
    fileID
    for fileID in fileIdentifierList
    for metadataID in metadataIdentifierList
    if fileID.startswith(metadataID)
]
# Files on disk that matched no metadata identifier, written to
# filesNotInMetadata.csv under a 'fileItemID' header.
filesNotInMetadata = set(fileIdentifierList) - set(fileMatches)
# The with-block closes and flushes the file (it was previously left open);
# newline='' lets the csv module control line endings.
with open('filesNotInMetadata.csv', 'w', newline='') as notInMetadataFile:
    f = csv.writer(notInMetadataFile)
    f.writerow(['fileItemID'])
    # A set has no defined iteration order; sorting makes the report
    # reproducible from one run to the next.
    f.writerows([file] for file in sorted(filesNotInMetadata))
# Metadata identifiers that are a prefix of at least one file identifier.
# any() stops at the first matching file, so an identifier is recorded once
# per metadata record rather than once per matching file. The old behaviour
# put N copies of an identifier in the list when N files matched, and those
# copies leaked into the metadataWithFiles.csv output further down.
metadataMatches = [
    metadataID
    for metadataID in metadataIdentifierList
    if any(fileID.startswith(metadataID) for fileID in fileIdentifierList)
]

# Metadata identifiers for which no file was found.
metadataWithNoFiles = set(metadataIdentifierList) - set(metadataMatches)
# Copy every metadata record whose 'fileIdentifier' has no matching file into
# metadataWithNoFiles.csv, keeping the columns of the source CSV.
# Both files are managed by the with-block so the output is closed and
# flushed (it was previously left open); newline='' is what the csv module
# expects for the files it reads and writes.
with open(fileNameCSV, newline='') as csvfile, \
        open('metadataWithNoFiles.csv', 'w', newline='') as noFilesFile:
    reader = csv.DictReader(csvfile)
    # DictReader consumes the header line itself and exposes it as
    # fieldnames. The previous next(reader) call therefore swallowed the
    # first *data* record, which could never reach the output, and its
    # .iteritems() call does not exist on Python 3 dictionaries.
    # fieldnames is None for an empty input file.
    headerRow = list(reader.fieldnames or [])
    f = csv.writer(noFilesFile)
    f.writerow(headerRow)
    for row in reader:
        # metadataWithNoFiles is a set, so membership replaces the inner
        # scan-and-compare loop.
        if row['fileIdentifier'] in metadataWithNoFiles:
            f.writerow([row[value] for value in headerRow])
# Copy every metadata record whose 'fileIdentifier' matched at least one file
# into metadataWithFiles.csv, keeping the columns of the source CSV.
# metadataMatches may list the same identifier several times; a set gives a
# constant-time lookup and guarantees each record is written exactly once
# (iterating the list directly re-wrote the row, with its values appended
# again, for every repeated identifier).
matchedIdentifiers = set(metadataMatches)
# Both files are managed by the with-block so the output is closed and
# flushed (it was previously left open); newline='' is what the csv module
# expects for the files it reads and writes.
with open(fileNameCSV, newline='') as csvfile, \
        open('metadataWithFiles.csv', 'w', newline='') as withFilesFile:
    reader = csv.DictReader(csvfile)
    # DictReader consumes the header line itself and exposes it as
    # fieldnames. The previous next(reader) call therefore swallowed the
    # first *data* record, which could never reach the output, and its
    # .iteritems() call does not exist on Python 3 dictionaries.
    # fieldnames is None for an empty input file.
    headerRow = list(reader.fieldnames or [])
    f = csv.writer(withFilesFile)
    f.writerow(headerRow)
    for row in reader:
        if row['fileIdentifier'] in matchedIdentifiers:
            f.writerow([row[value] for value in headerRow])
# Report the wall-clock time elapsed since startTime as hours:minutes:seconds.
elapsedTime = time.time() - startTime
totalMinutes, s = divmod(elapsedTime, 60)
h, m = divmod(totalMinutes, 60)
runTime = '%d:%02d:%02d' % (h, m, s)
print('Total script run time: ', runTime)