-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathEPFParser.py
296 lines (250 loc) · 12.2 KB
/
EPFParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# Copyright (c) 2010 Apple Inc. All rights reserved.
# IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") in
# consideration of your agreement to the following terms, and your use,
# installation, modification or redistribution of this Apple software
# constitutes acceptance of these terms. If you do not agree with these terms,
# please do not use, install, modify or redistribute this Apple software.
# In consideration of your agreement to abide by the following terms, and subject
# to these terms, Apple grants you a personal, non-exclusive license, under Apple's
# copyrights in this original Apple software (the "Apple Software"), to use,
# reproduce, modify and redistribute the Apple Software, with or without
# modifications, in source and/or binary forms; provided that if you redistribute
# the Apple Software in its entirety and without modifications, you must retain
# this notice and the following text and disclaimers in all such redistributions
# of the Apple Software. Neither the name, trademarks, service marks or logos of
# Apple Inc. may be used to endorse or promote products derived from the Apple
# Software without specific prior written permission from Apple. Except as
# expressly stated in this notice, no other rights or licenses, express or implied,
# are granted by Apple herein, including but not limited to any patent rights that
# may be infringed by your derivative works or by other works in which the Apple
# Software may be incorporated.
# The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
# WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
# WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
# COMBINATION WITH YOUR PRODUCTS.
# IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
# OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
# (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import re
import logging
import codecs
# Module-level logger. getLogger() is called without a name, so this is the root logger.
LOGGER = logging.getLogger()
class SubstringNotFoundException(Exception):
    """
    Raised when a required comment character or other tag is missing from the text being parsed.
    """
class Parser(object):
    """
    Parses an EPF file.

    During initialization, all the file db metadata is stored, and the
    file seek position is set to the beginning of the file, so the first call
    to nextRecord() skips the header comments and returns the first data record.
    The Parser object can then be used directly by an Ingester to create
    and populate the table.

    typeMap is a dictionary mapping datatype strings in the file to corresponding
    types for the database being used. The default map is for MySQL.

    The underlying file stays open for the life of the parser; call close()
    (or use the parser in a ``with`` statement) to release it.
    """
    commentChar = "#"
    recordDelim = "\x02\n"
    fieldDelim = "\x01"
    primaryKeyTag = "primaryKey:"
    dataTypesTag = "dbTypes:"
    exportModeTag = "exportMode:"
    recordCountTag = "recordsWritten:"

    #matches a date value that consists of a four-digit year only
    _yearOnly = re.compile(r"^\d\d\d\d$")

    def __init__(self, filePath, typeMap=None, recordDelim='\x02\n', fieldDelim='\x01'):
        """
        Opens filePath (UTF-8) and reads its header metadata.

        filePath    -- path of the EPF file; opening raises if it does not exist
        typeMap     -- dict mapping file datatype strings to database datatypes;
                       defaults to {"CLOB": "LONGTEXT"}
        recordDelim -- string that terminates each record
        fieldDelim  -- string that separates the fields of a record

        Raises ValueError if the record count in the file's trailer cannot be parsed,
        and SubstringNotFoundException if the first row is not a comment row.
        """
        #build a fresh dict per instance instead of sharing one mutable default across all parsers
        self.dataTypeMap = ({"CLOB": "LONGTEXT"} if typeMap is None else typeMap)
        self.numberTypes = ["INTEGER", "INT", "BIGINT", "TINYINT"]
        self.dateTypes = ["DATE", "DATETIME", "TIME", "TIMESTAMP"]
        self.columnNames = []
        self.primaryKey = []
        self.dataTypes = []
        self.exportMode = None
        self.dateColumns = [] #fields containing dates need special treatment; we'll cache the indexes here
        self.numberColumns = [] #numeric fields don't accept NULL; we'll cache the indexes here to use later
        self.typeMap = None
        self.recordsExpected = 0
        self.latestRecordNum = 0
        self.commentChar = Parser.commentChar
        self.recordDelim = recordDelim
        self.fieldDelim = fieldDelim

        self.eFile = codecs.open(filePath, mode="r", encoding="utf-8") #this will throw an exception if filePath does not exist
        try:
            self._readHeader()
        except Exception:
            #don't leak the file handle when the file turns out to be malformed
            self.eFile.close()
            raise

    def __enter__(self):
        """Allows ``with Parser(path) as parser:``; returns the parser itself."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Closes the underlying file when the ``with`` block exits."""
        self.close()

    def close(self):
        """
        Closes the underlying file. The parser cannot be read from afterwards.
        """
        self.eFile.close()

    def _readRecordsExpected(self):
        """
        Returns the record count stored in the recordsWritten line at the end of the file.
        """
        self.eFile.seek(0, os.SEEK_END)
        fileSize = self.eFile.tell()

        #We want roughly the last 40 bytes. That offset may fall in the middle of a multi-byte
        #UTF-8 character, so step backwards a byte at a time until the tail decodes cleanly.
        tailSizes = range(40, 46)
        tail = None
        for tailSize in tailSizes:
            #clamp to the start of the file so files shorter than the tail don't fail the seek
            self.eFile.seek(max(fileSize - tailSize, 0), os.SEEK_SET)
            try:
                tail = self.eFile.read() #reads from the seek position to end of file
            except UnicodeDecodeError:
                if tailSize == tailSizes[-1]:
                    raise #reraise error if the last candidate offset is still bad
            else:
                break #decoded cleanly, stop looking!

        #Parse the recordsWritten line
        countStr = tail.split(self.commentChar + Parser.recordCountTag).pop()
        return int(countStr.rpartition(self.recordDelim)[0])

    def _readHeader(self):
        """
        Populates the metadata attributes (record count, column names, primary key,
        datatypes, export mode) and leaves the seek position at the beginning of the file.
        """
        self.recordsExpected = self._readRecordsExpected()
        self.eFile.seek(0, os.SEEK_SET) #seek back to the beginning

        #Extract the column names
        line1 = self.nextRowString(ignoreComments=False)
        self.columnNames = self.splitRow(line1, requiredPrefix=self.commentChar)

        #We'll now grab the rest of the header data, without assuming a particular order
        primStart = self.commentChar + Parser.primaryKeyTag
        dtStart = self.commentChar + Parser.dataTypesTag
        exStart = self.commentChar + Parser.exportModeTag

        #Grab the next 6 rows, which should include all the header comments
        firstRows = [self.nextRowString(ignoreComments=False) for _ in range(6)]
        firstRows = [aRow for aRow in firstRows if aRow] #strip None rows (possible if the file is < 6 rows)

        #Loop through the rows, extracting the header info
        for aRow in firstRows:
            if aRow.startswith(primStart):
                self.primaryKey = self.splitRow(aRow, requiredPrefix=primStart)
                self.primaryKey = ([] if self.primaryKey == [''] else self.primaryKey)
            elif aRow.startswith(dtStart):
                dts = self.splitRow(aRow, requiredPrefix=dtStart)
                #HACK doing terrible things to make the retail_price column big enough
                self.dataTypes = ['DECIMAL(11,3)' if dt == 'DECIMAL(9,3)' else dt for dt in dts]
            elif aRow.startswith(exStart):
                self.exportMode = self.splitRow(aRow, requiredPrefix=exStart)[0]

        self.eFile.seek(0, os.SEEK_SET) #seek back to the beginning

        #Convert any datatypes to mapped counterparts, and cache indexes of date/time types and number types.
        #The date/number checks deliberately use the datatype as written in the file, not the mapped one.
        for j, dType in enumerate(self.dataTypes):
            if dType in self.dataTypeMap:
                self.dataTypes[j] = self.dataTypeMap[dType]
            if dType in self.dateTypes:
                self.dateColumns.append(j)
            if dType in self.numberTypes:
                self.numberColumns.append(j)

        #Build a dictionary of column names to data types
        self.typeMap = dict(zip(self.columnNames, self.dataTypes))

    def setSeekPos(self, pos=0):
        """
        Sets the underlying file's seek position.

        This is useful for resuming a partial ingest that was interrupted for some reason.
        """
        self.eFile.seek(pos)

    def getSeekPos(self):
        """
        Gets the underlying file's seek position.

        NOTE(review): this is the position reported by the underlying file object; codecs
        readers read ahead into an internal buffer, so the value may lie beyond the last
        record actually returned -- confirm before relying on it to resume an ingest.
        """
        return self.eFile.tell()

    seekPos = property(fget=getSeekPos, fset=setSeekPos, doc="Seek position of the underlying file")

    def seekToRecord(self, recordNum):
        """
        Set the seek position to the beginning of the recordNumth record.

        Seeks to the beginning of the file if recordNum <=0,
        or the end if it's greater than the number of records.
        """
        self.seekPos = 0
        self.latestRecordNum = 0
        if recordNum <= 0:
            return
        for _ in range(recordNum):
            countBefore = self.latestRecordNum
            self.advanceToNextRecord()
            if self.latestRecordNum == countBefore:
                break #the counter didn't move, so we hit end of file; no point reading further

    def nextRowString(self, ignoreComments=True):
        """
        Returns (as a string) the next row of data (as delimited by self.recordDelim),
        ignoring comments if ignoreComments is True. Returns None at end of file.

        Leaves the delimiters in place.

        Unfortunately Python doesn't allow line-based reading with user-supplied line separators
        (http://bugs.python.org/issue1152248), so we use normal line reading and then concatenate
        when we hit 0x02.
        """
        lines = []
        while True:
            ln = self.eFile.readline()
            if not ln: #end of file
                break
            #only the first textual line of a row can mark it as a comment;
            #a '#' at the start of a continuation line is ordinary data
            if not lines and ignoreComments and ln.startswith(self.commentChar):
                continue
            lines.append(ln)
            if self.recordDelim in ln: #last textual line of this record
                break
        if not lines:
            return None
        return "".join(lines) #concatenate the lines into a single string, which is the full content of the row

    def advanceToNextRecord(self):
        """
        Performs essentially the same task as nextRowString, but without constructing or returning anything.

        This allows much faster access to a record in the middle of the file.
        Comment rows are always skipped. self.latestRecordNum is incremented only
        if a record was actually consumed (i.e. not at end of file).
        """
        atRowStart = True
        while True:
            ln = self.eFile.readline()
            if not ln: #end of file
                return
            #match nextRowString: only a row's first line can be a comment, so that a
            #continuation line beginning with '#' still counts as part of its record
            if atRowStart and ln.startswith(self.commentChar):
                continue
            atRowStart = False
            if self.recordDelim in ln: #last textual line of this record
                break
        self.latestRecordNum += 1

    def splitRow(self, rowString, requiredPrefix=None):
        """
        Given rowString, strips requiredPrefix and self.recordDelim,
        then splits on self.fieldDelim, returning the resulting list.

        If rowString does not start with requiredPrefix, throws a SubstringNotFoundException.
        """
        if requiredPrefix:
            if not rowString.startswith(requiredPrefix):
                expl = "Required prefix '%s' was not found in '%s'" % (requiredPrefix, rowString)
                raise SubstringNotFoundException(expl)
            rowString = rowString[len(requiredPrefix):]
        content = rowString.partition(self.recordDelim)[0]
        return content.split(self.fieldDelim)

    def nextRecord(self):
        """
        Returns the next row of data as a list, or None if we're out of data.

        Empty fields are returned as the string "NULL", and date fields are
        massaged into a MySQL-compatible format.
        """
        rowString = self.nextRowString()
        if not rowString:
            return None

        self.latestRecordNum += 1 #update the record counter
        #if there are more fields than column names, trim the surplus via a slice
        rec = self.splitRow(rowString)[:len(self.columnNames)]
        #replace empty strings with NULL
        rec = [("NULL" if val == "" else val) for val in rec]

        #massage dates into MySQL-compatible format.
        #most date values look like '2009 06 21'; some are '2005-09-06-00:00:00-Etc/GMT'
        #there are also some cases where there's only a year; we'll pad it out with a bogus month/day
        for j in self.dateColumns:
            if j >= len(rec):
                continue #row has fewer fields than the header declares; nothing to massage
            rec[j] = rec[j].strip().replace(" ", "-")[:19] #Include at most the first 19 chars
            if Parser._yearOnly.match(rec[j]):
                rec[j] = "%s-01-01" % rec[j]
        return rec

    def nextRecords(self, maxNum=100):
        """
        Returns the next maxNum records (or fewer if EOF) as a list of lists.
        """
        records = []
        for _ in range(maxNum):
            rec = self.nextRecord()
            if not rec:
                break
            records.append(rec)
        return records

    def nextRecordDict(self):
        """
        Returns the next row of data as a dictionary, keyed by the column names,
        or None if we're out of data.
        """
        vals = self.nextRecord()
        if not vals:
            return None
        return dict(zip(self.columnNames, vals))