-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstructured_file_formats.py
383 lines (292 loc) · 12.8 KB
/
structured_file_formats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
'''Structured Text Files'''
# In simple text files, the only level of organization is the line. Sometimes,
# you need more structure than that. You might want to save data for your
# program to use later, or send data to another program. There are many
# formats. Each of these can be read and written by at least one Python module:
# – A separator (or delimiter) character like tab '\t', comma ',', or vertical
# bar '|'. This is an example of the comma-separated values (CSV) format.
# – < and > around tags. Examples include XML and HTML.
# – Punctuation. An example is JavaScript Object Notation (JSON).
# – Indentation. An example is YAML (.yaml, .yml)
# – Miscellaneous, such as configuration files for programs.
# CSV
# -----------------------------------------------------------------------------
# Delimited files are often used as an exchange format for spreadsheets and
# databases. You could read CSV files manually, a line at a time, splitting
# each line into fields at comma separators, and adding the results to data
# structures such as lists and dictionaries. But it's better to use the
# standard csv module, because parsing these files can be more complicated.
# Some have alternate delimiters besides a comma like '|' and '\t' (tab)
# Some have escape sequences. If the delimiter character can occur within a
# field, the entire field might be surrounded by quote characters or preceded
# by some escape character. Files have different line-ending characters. Unix
# uses '\n', Microsoft uses '\r\n', and Apple used to use '\r' but now uses '\n'.
# There can be column names in the first line.
import csv
from pprint import pprint
# Sample rows: one [first, last] pair per singer.
singers = [
    ['Maynard', 'James Keenan'],
    ['Thom', 'York'],
    ['Alison', 'Mosshart'],
    ['Win', 'Butler'],
    ['Etta', 'James'],
]

# newline='' lets the csv module control line endings itself. Without it,
# platforms that translate '\n' to '\r\n' (Windows) write '\r\r\n', and every
# row is followed by a blank line when the file is read back.
with open('data/singers.csv', 'w', newline='') as fout:
    csv_out = csv.writer(fout)
    csv_out.writerows(singers)
# This creates a file 'singers.csv'. Try reading it back in:
# newline='' hands the raw line endings to the csv module, so newlines
# embedded inside quoted fields are parsed correctly.
with open('data/singers.csv', 'r', newline='') as fin:
    csv_in = csv.reader(fin)
    singers = list(csv_in)  # each row becomes a list of column strings
# Using reader() and writer() with their default options, columns are
# separated by commas; writer() ends each row with '\r\n' by default.
# DictReader()
# The data can be interpreted as list of dictionaries instead of lists:
# Passing fieldnames tells DictReader the file has no header row, so the
# first line is treated as data. newline='' leaves line-ending handling to
# the csv module.
with open('data/singers.csv', 'r', newline='') as fin:
    csv_in = csv.DictReader(fin, fieldnames=['first', 'last'])
    singers = list(csv_in)
pprint(singers)
# On Python 3.8+ each row is a plain dict:
# [{'first': 'Maynard', 'last': 'James Keenan'},
#  {'first': 'Thom', 'last': 'York'},
#  {'first': 'Alison', 'last': 'Mosshart'},
#  {'first': 'Win', 'last': 'Butler'},
#  {'first': 'Etta', 'last': 'James'}]
# (Python 3.6 and 3.7 return OrderedDict rows with the same contents.)
# DictWriter() will use a list of dictionaries to write the CSV file.
# writeheader() will write an initial line of column names to the CSV file.
# One dict per singer; the keys must match the fieldnames given to DictWriter.
singers = [
    {'first': 'Jack', 'last': 'White'},
    {'first': 'Eddie', 'last': 'Vedder'},
    {'first': 'David', 'last': 'Bowie'},
    {'first': 'Josh', 'last': 'Homme'},
    {'first': 'Annie', 'last': 'Lennox'},
]

# newline='' stops the platform from rewriting the '\r\n' that the csv module
# emits, which would otherwise produce blank rows on Windows.
with open('data/singers.csv', 'w', newline='') as fout:
    csv_out = csv.DictWriter(fout, ['first', 'last'])
    csv_out.writeheader()  # first line of the file: first,last
    csv_out.writerows(singers)
# This time when reading back in with DictReader(), we'll omit the fieldnames
# argument. It will then use the values it finds on the first line instead.
# With no fieldnames argument, DictReader takes the column names from the
# first line of the file. newline='' leaves line-ending handling to the csv
# module.
with open('data/singers.csv', 'r', newline='') as fin:
    csv_in = csv.DictReader(fin)
    singers = list(csv_in)
# XML
# -----------------------------------------------------------------------------
# practice.xml code for example below:
"""
<?xml version="1.0"?>
<menu>
<breakfast hours="7-11">
<item price="$6.00">breakfast burritos</item>
<item price="$4.00">pancakes</item>
</breakfast>
<lunch hours="11-3">
<item price="$5.00">hamburger</item>
</lunch>
<dinner hours="3-10">
<item price="8.00">spaghetti</item>
</dinner>
</menu>
"""
# A few things to keep in mind when working with XML: though it looks much
# like HTML, the choice of where to put attributes, values and nested tags is
# somewhat arbitrary. For instance, either of the following could appear:
# <item price="8.00">spaghetti</item>
# <item price="$8.00" food="spaghetti"/>
# XML is often used for data feeds and messages. Its flexibility has
# inspired multiple Python libraries that differ in approach and
# capabilities. The simplest way to parse XML in Python is by using:
# ElementTree:
# This will parse the practice.xml file and print some tags and attributes:
import xml.etree.ElementTree as et
# Parse the document, then pull out its top-level element.
tree = et.parse('data/practice.xml')
root = tree.getroot()
print(root.tag)

# Walk two levels down: every child of the root, then each of its children.
for section in root:
    print('tag:', section.tag, 'attributes:', section.attrib)
    for item in section:
        print('\ttag:', item.tag, 'attributes:', item.attrib)

# len() on an element counts its direct children.
print(len(root))  # number of menu sections
print(len(root[0]))  # number of breakfast items
# ElementTree has many other ways of searching XML-derived data, modifying
# it, and even writing XML files. The ElementTree documentation can be found:
# https://docs.python.org/3/library/xml.etree.elementtree.html
# Other standard Python XML libraries include:
# xml.dom - This module loads the entire XML file into memory and lets you
# access all the pieces equally.
# xml.sax - simple API for XML, parses XML on the fly, so it does not have
# to load everything into memory at once. A good choice if you need
# to process very large streams of XML.
# HTML
# -----------------------------------------------------------------------------
# The problem here is so much of it doesn't follow the HTML rules, which can
# make it difficult to parse. Also, much of HTML is intended more to format
# output than interchange data. More to come...
# JSON
# -----------------------------------------------------------------------------
# Has become a very popular data interchange format. The JSON format is a
# subset of JavaScript, and often legal Python syntax as well. Its close fit
# to Python makes it a good choice for data interchange among programs.
# The JSON module can encode python data to a JSON *string* (dumps) and
# decode a JSON string back to python data (loads).
# The module also includes methods that can encode to a JSON *file* (dump) and
# decode from a JSON file (load).
import json
# Here's a pythonic data structure to use:
# Nested dicts keyed by meal; each meal carries its opening 'hours' and an
# 'items' mapping of dish name -> price string.
menu = {
    'breakfast': {
        'hours': '7-11',
        'items': {
            'breakfast burritos': '$6.00',
            'pancakes': '$4.00',
            'bagel': '$2.50',
        },
    },
    'lunch': {
        'hours': '11-3',
        'items': {
            'sandwich': '$8.00',
        },
    },
    'dinner': {
        'hours': '3-10',
        'items': {
            'spaghetti': '$9.00',
        },
    },
}

# dumps() serializes the structure into a JSON-formatted str:
menu_json = json.dumps(menu)
print(type(menu_json))
# <class 'str'>

# loads() parses that str back into Python objects:
menu_from_string = json.loads(menu_json)
print(type(menu_from_string))
# <class 'dict'>
# Note: on Python 3.7+ dicts keep insertion order, so the keys come back in
# the same order; on older versions the order may differ from the original.
# Encode the data structure to a JSON *file* by using dump():
# dump() serializes straight into an open, writable file object:
with open('data/menu.json', 'w') as fout:
    json.dump(menu, fout)

# load() parses JSON from an open file object:
with open('data/menu.json') as fin:
    menu_from_file = json.load(fin)

print(menu_from_file['lunch']['items'])
# {'sandwich': '$8.00'}
# You might get an exception while trying to encode or decode some objects,
# such as datetime. This can happen because the JSON standard does not define
# date or time types; it expects you to define how to handle them. You could
# convert the datetime to something JSON understands, such as a string or an
# epoch value. More to come...
# If the datetime value could occur in the middle of normally converted data
# types, it might be annoying to make these special conversions. You can
# modify how JSON is encoded by using inheritance. Python's JSON
# documentation gives an example of this for complex numbers, which can be
# modified for datetime:
import datetime
from time import mktime
# Use a timezone-aware "now": a naive utcnow() value would be misread as
# local time when converted to an epoch value (and utcnow() is deprecated).
now = datetime.datetime.now(datetime.timezone.utc)


class DTEncoder(json.JSONEncoder):
    """Encode datetime objects as integer epoch seconds."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for obj.

        datetime.datetime values become whole seconds since the Unix epoch.
        Aware datetimes are converted using their own UTC offset; naive ones
        are interpreted as local time. Anything else is passed to the parent
        class, which raises TypeError for types it cannot serialize.
        """
        # isinstance() checks the type of object:
        if isinstance(obj, datetime.datetime):
            # timestamp() honors tzinfo, unlike mktime(obj.timetuple()),
            # which drops the offset and treats every value as local time.
            # Microseconds are cleared first so the result is whole seconds,
            # matching what the timetuple-based conversion produced.
            return int(obj.replace(microsecond=0).timestamp())
        # else defer to the parent encoder (raises TypeError if unsupported):
        return super().default(obj)


json.dumps(now, cls=DTEncoder)
# The new class DTEncoder is a subclass, or child class, of JSONEncoder.
# We only need to override its default() method to add datetime handling.
# Inheritance ensures that everything else will be handled by the parent
# class. The isinstance() function checks whether the object obj is of the
# class datetime.datetime.
# For JSON and other structured text formats, you can load from a file into
# data structures without knowing anything about the structures ahead of
# time. Then, you can walk through the structures by using isinstance() and
# type-appropriate methods to examine their values. For example, if one of
# the items is a dictionary, you can extract contents through keys(),
# values(), and items().
# see also: json_example.py
# YAML
# -----------------------------------------------------------------------------
# practice.yaml code for example below:
"""
name:
first: James
last: McIntyre
dates:
birth: 1828-05-25
death: 1906-03-31
details:
bearded: true
themes: [cheese, Canada]
books:
url: http://www.gutenberg.org/files/36068/36068-h/36068-h.htm
poems:
- title: 'Motto'
text: |
Politeness, perseverance and pluck,
To their possessor will bring good luck.
- title: 'Canadian Charms'
text: |
Here industry is not in vain,
For we have bounteous crops of grain,
And you behold on every field
Of grass and roots abundant yield,
But after all the greatest charm
Is the snug home upon the farm,
And stone walls now keep cattle warm.
"""
# Similar to JSON, YAML has keys and values, but handles more data types
# such as dates and times. The standard Python library does not yet include
# YAML handling, so you need to install the third-party PyYAML package
# (pip install pyyaml; imported as yaml) to manipulate it. safe_load()
# converts a YAML string to Python data, whereas dump() does the opposite.
# Values such as true, false, on, and off are converted to Python Booleans.
# Integers and strings are converted to their Python equivalents. Other
# syntax creates lists and dictionaries:
import yaml
# Read the whole document first, then hand the string to the safe loader.
with open('data/practice.yaml') as fin:
    text = fin.read()
data = yaml.safe_load(text)

print(data['details'])
poems = data['poems']
print(len(poems))
print(poems[1]['title'])
# PyYAML can load() Python objects from strings, and this is dangerous.
# Always use safe_load() instead of load(), especially if you're importing
# YAML that you don't trust.
# Configuration files
# -----------------------------------------------------------------------------
# The standard configparser module handles Windows-style .ini files.
# Such files have sections of key = value definitions.
# practice.cfg code for example below:
"""
[english]
greeting = Hello
[french]
greeting = Bonjour
[files]
home = /usr/local
# simple interpolation:
bin = %(home)s/bin
"""
import configparser
# Build a parser and load the .ini-style file into it.
cfg = configparser.ConfigParser()
cfg.read('data/practice.cfg')

# Sections index like dicts: first by section name, then by option name.
french = cfg['french']
print(french['greeting'])
# Serialize with pickle
# -----------------------------------------------------------------------------
# see pickling.py
# Shelve module
# -----------------------------------------------------------------------------
# see shelve_module.py
# Spreadsheets
# -----------------------------------------------------------------------------
# Spreadsheets, notably Microsoft Excel, are widespread binary data formats.
# If you can save your spreadsheet to a CSV file, you can read it by using
# the standard csv module that was described earlier. If you have a binary
# xls file, xlrd is a third-party package for reading it (xlwt writes xls).
# HDF5
# -----------------------------------------------------------------------------
# is a binary data format for multidimensional or hierarchical numeric data.
# It's used mainly in science, where fast random access to large datasets
# (gigabytes to terabytes) is a common requirement. It's best suited to WORM
# (write once/read many) applications for which database protection against
# conflicting writes is not needed. Consider HDF5 when you need to store and
# retrieve large amounts of data and are willing to consider something
# outside the box (as well as the usual database solutions). HDF5 modules:
# h5py is a full-featured low-level interface
# PyTables is a bit higher-level, with database-like features