This repository has been archived by the owner on Aug 3, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
marc2csv.py
executable file
·225 lines (194 loc) · 6.21 KB
/
marc2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
Converts MARC 21 to CSV.
"""
import chardet, codecs, cStringIO, csv, sys
from pymarc import MARCReader
class UnicodeWriter:
    """
    A CSV writer which will write rows to the CSV file-like object "file",
    which is encoded in the given encoding.

    Rows are first serialised by the stdlib csv module into an in-memory
    UTF-8 queue, then re-encoded into the target encoding and flushed to
    the real output stream.
    Taken (and extended) from: <http://docs.python.org/library/csv.html>
    """
    def __init__(self, file, dialect = csv.excel, encoding = "utf-8", **kwds):
        # Redirect output to a queue: csv.writer writes byte strings here
        # first, and writerow() re-encodes them for the target stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect = dialect, **kwds)
        self.stream = file
        # Incremental encoder for the target encoding (default UTF-8).
        self.encoder = codecs.getincrementalencoder(encoding)()
    def decode(self, string, encodings = ("ascii", "utf8", "latin1")):
        """
        Decode a byte string to unicode.

        Tries the given encodings in order, then chardet's guess, and
        finally ASCII with errors ignored — so this never raises for the
        listed fallbacks (a wrong chardet guess could still raise).
        """
        # Try to decode string with the most common encodings.
        for encoding in encodings:
            try:
                return string.decode(encoding)
            except UnicodeDecodeError:
                pass
        # If not decoded, try to guess the encoding.
        guess = chardet.detect(string)
        if guess["encoding"]:
            return string.decode(guess["encoding"])
        # If everything fails, just decode it as ASCII and ignore errors.
        return string.decode("ascii", "ignore")
    def encode(self, string):
        """
        Encode a string into UTF-8.

        In Python 2, encoding a non-ASCII byte string triggers an implicit
        ASCII decode that raises UnicodeDecodeError; in that case the
        string is decoded via self.decode() first and then encoded.
        """
        try:
            return string.encode("UTF-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # If encoding attempt fails, try to decode the string first.
            string = self.decode(string)
            return string.encode("UTF-8")
    def cleanrow(self, row):
        """
        Try to clean the encodings in a CSV row first.

        Integers are stringified; every cell is passed through encode(),
        so the returned list holds only UTF-8 byte strings.
        """
        cleanRow = []
        for s in row:
            if isinstance(s, int):
                s = str(s)
            s = self.encode(s)
            cleanRow.append(s)
        return cleanRow
    def writerow(self, row):
        # Serialise one cleaned row into the UTF-8 queue, then re-encode
        # the queued bytes into the target encoding and flush them.
        cleanRow = self.cleanrow(row)
        self.writer.writerow(cleanRow)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)
    def writerows(self, rows):
        # Write a sequence of rows, one writerow() call per row.
        for row in rows:
            self.writerow(row)
class MARC2CSV (object):
    """
    Converts MARC 21 records to CSV.

    Reads the MARC file named by the first command-line argument, writes
    one CSV row per subfield occurrence (or per field, for fields without
    subfields) to "<filepath>.csv", and dumps records lacking field 001
    (the system number) to "<filepath>-records-wo-001.log".
    """
    def __init__(self,):
        """
        Load the MARC file given as the first command-line argument and
        open the CSV output and log files next to it.

        Raises:
            Exception: if no file path argument was supplied.
            SystemExit: if the MARC file cannot be opened.
        """
        if len(sys.argv) > 1:
            filepath = sys.argv[1]
        else:
            raise Exception(
                "You need to provide a file path to the MARC file as an argument."
            )
        try:
            self.reader = MARCReader(
                open("{0}".format(filepath), "r"),
                to_unicode = True, # This seems to clean it a little bit.
                force_utf8 = True
            )
        except IOError:
            print >> sys.stderr, "Cannot open {0}".format(filepath)
            raise SystemExit
        # A file to log records without field 001 (system number)
        self.log = open("{0}-records-wo-001.log".format(filepath), "w")
        # CSV output goes next to the input file ("wb": the csv module
        # expects a binary-mode stream in Python 2).
        self.outputFile = open("{0}.csv".format(filepath), "wb")
        self.writer = UnicodeWriter(
            self.outputFile,
            delimiter = ",",
            quoting = csv.QUOTE_MINIMAL
        )
    def logRecord(self, record):
        """
        Log a simple text dump of a record, one "tag: value" per line.
        """
        recordDump = ["{0}: {1}".format(str(field.tag), field.value()) for field in record.get_fields()]
        output = "\n".join(recordDump)
        self.log.write(output + "\n")
    def checkRecord(self, record):
        """
        Check if we have the primary identifier from the field 001.

        Returns True when the record has a (truthy) 001 field; otherwise
        dumps the record to the log file and returns False.
        """
        if record["001"]:
            return True
        else:
            self.logRecord(record)
            return False
    def writeRow(self, row):
        """
        Write one CSV row. Column layout:
        System number, field tag, number of field's occurrence, first indicator,
        second indicator, subfield label, number of subfield's occurrence, value
        """
        self.writer.writerow(row)
    def processRecord(self, record):
        """
        Process individual MARC record: emit one CSV row per subfield,
        or one row per field for fields without subfields.
        """
        if self.checkRecord(record):
            # Initialize dict for tracking
            # which field tags and how many times were used.
            usedTags = {}
            sysno = record["001"].value()
            for field in record.fields:
                if not field.value() == "": # Skip empty fields
                    # Per-field dict tracking how many times each
                    # subfield label has occurred.
                    usedSubfields = {}
                    tag = field.tag
                    # Increment used field tags
                    if not usedTags.has_key(tag):
                        usedTags[tag] = 1
                    else:
                        usedTags[tag] += 1
                    # Get field indicators
                    try:
                        ind1, ind2 = field.indicators
                    except AttributeError: # The field has no indicators
                        ind1 = ind2 = ""
                    try:
                        # The field has some subfields. field.subfields is
                        # a flat list alternating label, value, label, ...
                        # so each value row reuses the label seen on the
                        # previous (even) iteration.
                        for index, subfield in enumerate(field.subfields):
                            if index % 2: # Odd (zero-based) items are subfield values
                                value = subfield
                                self.writeRow([
                                    sysno,
                                    tag,
                                    usedTags[tag],
                                    ind1,
                                    ind2,
                                    subfieldLabel,
                                    usedSubfields[subfieldLabel],
                                    value
                                ])
                            else: # Even (zero-based) items are subfield labels
                                subfieldLabel = subfield
                                # Increment used subfield labels
                                if not usedSubfields.has_key(subfieldLabel):
                                    usedSubfields[subfieldLabel] = 1
                                else:
                                    usedSubfields[subfieldLabel] += 1
                    except AttributeError:
                        # The field has no subfields.
                        self.writeRow([
                            sysno,
                            tag,
                            usedTags[tag],
                            ind1,
                            ind2,
                            "", # Empty subfield label
                            "", # Empty subfield count
                            field.value()
                        ])
    def main(self):
        """
        Convert every record in the reader, then close the log and the
        CSV output file.
        """
        for record in self.reader:
            self.processRecord(record)
        self.log.close()
        self.outputFile.close()
def main():
    """Command-line entry point: build the converter and run it."""
    converter = MARC2CSV()
    converter.main()

if __name__ == "__main__":
    main()