Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] csv dialect #1

Merged
merged 15 commits into from
Oct 9, 2015
4 changes: 2 additions & 2 deletions riwo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def name(self):
# str
@property
def encoding(self):
if hasattr(self.resource, 'encoding'):
if hasattr(self.resource, 'encoding') and self.resource.encoding is not None:
return str(self.resource.encoding)

return 'utf-8'
Expand Down Expand Up @@ -170,7 +170,7 @@ def write(self):

# void
def write_item(self, item):
raise extension.NotImplemented("{self}'s `write_item(item)` function is not implemented yet!" \
raise exceptions.NotImplemented("{self}'s `write_item(item)` function is not implemented yet!" \
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks. 👍

.format(self=self.name))

from .dialects import *
Expand Down
11 changes: 11 additions & 0 deletions riwo/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@
long = int
unicode = str

import io
StringIO=io.StringIO

import urllib.request
import urllib.parse
urlparse = urllib.parse.urlparse
urlopen = urllib.request.urlopen


else:
string_types = basestring,
integer_types = (int, long)
Expand All @@ -26,6 +30,13 @@
binary_type = str
long = long

import codecs
getencoder=codecs.getincrementalencoder
getreader=codecs.getreader

import cStringIO
StringIO=cStringIO.StringIO

import urllib
import urlparse
urlparse = urlparse.urlparse
Expand Down
91 changes: 82 additions & 9 deletions riwo/dialects/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,12 @@
QUOTE_NONE
)

# TODO: Python 2.7 compatibility is missing. (encoding is failing)

class Reader(AbstractReader):
class Py3Reader(AbstractReader):
# void
def __init__(self, resource, schema, offset=0, limit=None, use_header=False, **fmtparams):
self.fmtparams = fmtparams
self.use_header = use_header
super(Reader,self).__init__(resource, schema, offset, limit)
super(Py3Reader,self).__init__(resource, schema, offset, limit)

# function
def get_mapper(self):
Expand All @@ -32,23 +30,47 @@ def get_mapper(self):

# Iterable (csv.reader or csv.DictReader)
def get_iterable_data(self):
resource_gen = (decode(r, self.encoding) for r in to_iterable(self.resource)) # csv package can't open bytestream
resource_gen = (decode(r, self.encoding) for r in to_iterable(self.resource))
return csv.reader(resource_gen, **self.fmtparams) \
if not self.use_header \
else csv.DictReader(resource_gen, **self.fmtparams)

class Writer(AbstractWriter):
class Py2Reader(AbstractReader):
# void
def __init__(self, resource, schema, offset=0, limit=None, use_header=False, **fmtparams):
self.fmtparams = fmtparams
self.use_header = use_header
super(Py2Reader,self).__init__(resource, schema, offset, limit)

# function
def get_mapper(self):
return daprot.mapper.INDEX \
if not self.use_header \
else daprot.mapper.NAME

# Iterable
def get_iterable_data(self):
resource_gen = (encode(r, self.encoding) for r in to_iterable(self.resource))
reader_gen = csv.reader(resource_gen, **self.fmtparams)
result_gen = ( [unicode(c, self.encoding) for c in r] for r in reader_gen )
if not self.use_header:
return result_gen

header = result_gen.next()
return (dict(zip(header,r)) for r in result_gen)

class Py3Writer(AbstractWriter):
# void
def __init__(self, resource, iterable_data, input_schema=None, add_header=True, **fmtparams):
self.fmtparams = fmtparams
self.add_header = add_header
super(Writer, self).__init__(resource, iterable_data, input_schema)
super(Py3Writer, self).__init__(resource, iterable_data, input_schema)

if self.is_nested():
raise exceptions.NestedSchemaNotSupported("{self} is not support nested schemas." \
.format(self=self.name))

# csv.writer
# csv.DictWriter
def init_writer(self):
return csv.DictWriter(self.resource, fieldnames=self.fieldnames, **self.fmtparams)

Expand All @@ -63,8 +85,59 @@ def unmarshal_item(self, item):
# void
def write(self):
if self.add_header: self.writer.writeheader()
super(Writer, self).write()
super(Py3Writer, self).write()

# void
def write_item(self, item):
self.writer.writerow(unmarshal(item, self.unmarshal_item))

class Py2Writer(Py3Writer):
# UnicodeWriter
def init_writer(self):
return UnicodeWriter(self.resource, **self.fmtparams)

# void
def write_header(self):
self.writer.writerow(self.fieldnames)

# void
def write(self):
if self.add_header: self.write_header()
for item in self.reader:
self.write_item(item)

# void
def write_item(self, item):
data = unmarshal(item, self.unmarshal_item)
self.writer.writerow([data[c] for c in self.fieldnames])

class UnicodeWriter:
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = getencoder(encoding)()

def writerow(self, row):
self.writer.writerow([encode(s, "utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = decode(data, "utf-8")
# ... and reencode it into the target encoding
# data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)

def writerows(self, rows):
for row in rows:
self.writerow(row)

if PY3:
Reader = Py3Reader
Writer = Py3Writer
else:
Reader = Py2Reader
Writer = Py2Writer
174 changes: 174 additions & 0 deletions riwo/tests/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
import io
import os
import riwo
import daprot as dp
import string
import random
import datetime
import unittest
import requests
from riwo.compat import *
from . import CommonReader, __dir__, __remote__

class IndexSchema(dp.SchemaFlow):
id = dp.Field(0)
name = dp.Field(1, type=unicode, transforms=unicode.strip)
price = dp.Field(2)
quantity = dp.Field(3, type=long)
updated_at = dp.Field(4)

class HeaderedSchema(dp.SchemaFlow):
id = dp.Field()
name = dp.Field(type=unicode, transforms=unicode.strip)
price = dp.Field()
quantity = dp.Field(type=long)
updated_at = dp.Field()

class LocalReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = io.open(os.path.join(__dir__, 'test.csv'), 'r', encoding='utf-8')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a note for us. We should add tests for the use_header and add_header attribute.

self.reader = riwo.csv.Reader(self.resource, IndexSchema, delimiter=';')

class LocalHeaderReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = io.open(os.path.join(__dir__, 'test-header.csv'), 'r', encoding='utf-8')
self.reader = riwo.csv.Reader(self.resource, HeaderedSchema, use_header=True)

class RequestsReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = requests.get(os.path.join(__remote__, 'test.csv'))
self.reader = riwo.csv.Reader(self.resource, IndexSchema, delimiter=';')

class RequestsHeaderReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = requests.get(os.path.join(__remote__, 'test-header.csv'))
self.reader = riwo.csv.Reader(self.resource, HeaderedSchema, use_header=True)

class UrllibReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = urlopen(os.path.join(__remote__, 'test.csv'))
self.reader = riwo.csv.Reader(self.resource, IndexSchema, delimiter=';')

class UrllibHeaderReader(CommonReader, unittest.TestCase):
def setUp(self):
self.resource = urlopen(os.path.join(__remote__, 'test-header.csv'))
self.reader = riwo.csv.Reader(self.resource, HeaderedSchema, use_header=True)

class LocalWriter(unittest.TestCase):
class Schema(dp.SchemaFlow):
id = dp.Field()
name = dp.Field()
price = dp.Field()
quantity = dp.Field()
updated_at = dp.Field()

class NestedSchema(dp.SchemaFlow):
inner_schema = dp.DictOf(IndexSchema)

iterable_input = CommonReader.expected_result
test_file_base = os.path.join(__dir__, 'output-{token}.csv')

def setUp(self):
self.test_file_path = self.test_file_base.format(token=''.join([random.choice(string.ascii_uppercase) for i in range(6)]))
self.resource = io.open(self.test_file_path, 'w', encoding='utf-8')

def test_add_header(self):
self.writer = riwo.csv.Writer(self.resource, self.iterable_input, self.Schema, add_header=True)
self.writer.write()
self.content = u'''\
id,name,price,quantity,updated_at
P0001,đói,449,1,2015.09.20 20:00
P0002,배고픈,399,1,2015.09.20 20:02
P0003,голодный,199,10,
P0004,Űrállomás krízis,"999,5",1,2015.09.20 12:47
P0005,Ovális iroda,2 399,1,2015.09.20 07:31
'''

def test_not_add_header(self):
self.writer = riwo.csv.Writer(self.resource, self.iterable_input, self.Schema, add_header=False)
self.writer.write()
self.content = u'''\
P0001,đói,449,1,2015.09.20 20:00
P0002,배고픈,399,1,2015.09.20 20:02
P0003,голодный,199,10,
P0004,Űrállomás krízis,"999,5",1,2015.09.20 12:47
P0005,Ovális iroda,2 399,1,2015.09.20 07:31
'''

def test_delimiter(self):
self.writer = riwo.csv.Writer(self.resource, self.iterable_input, self.Schema, delimiter='\001')
self.writer.write()
self.content = u'''\
id\001name\001price\001quantity\001updated_at
P0001\001đói\001449\0011\0012015.09.20 20:00
P0002\001배고픈\001399\0011\0012015.09.20 20:02
P0003\001голодный\001199\00110\001
P0004\001Űrállomás krízis\001999,5\0011\0012015.09.20 12:47
P0005\001Ovális iroda\0012 399\0011\0012015.09.20 07:31
'''

def test_quote(self):
self.writer = riwo.csv.Writer(self.resource, self.iterable_input, self.Schema, \
delimiter='\\', quotechar='`', quoting=riwo.csv.QUOTE_ALL)
self.writer.write()
self.content = u'''\
`id`\`name`\`price`\`quantity`\`updated_at`
`P0001`\`đói`\`449`\`1`\`2015.09.20 20:00`
`P0002`\`배고픈`\`399`\`1`\`2015.09.20 20:02`
`P0003`\`голодный`\`199`\`10`\``
`P0004`\`Űrállomás krízis`\`999,5`\`1`\`2015.09.20 12:47`
`P0005`\`Ovális iroda`\`2 399`\`1`\`2015.09.20 07:31`
'''

def test_defined_schema_writer(self):
self.writer = riwo.csv.Writer(self.resource, self.iterable_input, self.Schema)
self.writer.write()
self.content = u'''\
id,name,price,quantity,updated_at
P0001,đói,449,1,2015.09.20 20:00
P0002,배고픈,399,1,2015.09.20 20:02
P0003,голодный,199,10,
P0004,Űrállomás krízis,"999,5",1,2015.09.20 12:47
P0005,Ovális iroda,2 399,1,2015.09.20 07:31
'''

def test_schema_flow_writer(self):
self.writer = riwo.csv.Writer(self.resource, self.Schema(self.iterable_input, mapper=dp.mapper.NAME))
self.writer.write()
self.content = u'''\
id,name,price,quantity,updated_at
P0001,đói,449,1,2015.09.20 20:00
P0002,배고픈,399,1,2015.09.20 20:02
P0003,голодный,199,10,
P0004,Űrállomás krízis,"999,5",1,2015.09.20 12:47
P0005,Ovális iroda,2 399,1,2015.09.20 07:31
'''

def test_without_schema(self):
with self.assertRaises(riwo.exceptions.SchemaRequired):
riwo.csv.Writer(self.resource, self.iterable_input)
self.content = u''

def test_unmarshal(self):
self.writer = riwo.csv.Writer(self.resource, [], self.Schema)
self.content = u''

current_datetime = datetime.datetime.now()
self.assertEqual(self.writer.unmarshal_item(current_datetime), current_datetime.isoformat())
self.assertEqual(self.writer.unmarshal_item(4.21), u'4.21')
self.assertEqual(self.writer.unmarshal_item(u'голодный'), u'голодный')

def test_requisites(self):
with self.assertRaises(riwo.exceptions.NestedSchemaNotSupported):
riwo.csv.Writer(self.resource, [], self.NestedSchema)
self.content = u''

def tearDown(self):
self.resource.close()
with io.open(self.test_file_path, 'r', encoding='utf-8') as f:
content = f.read()
self.assertEqual(self.content, content)
os.remove(self.test_file_path)
1 change: 1 addition & 0 deletions riwo/tests/source/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
output-*
10 changes: 5 additions & 5 deletions riwo/tests/source/test.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
P0001,đói,449,1,2015.09.20 20:00
P0002,배고픈,399,1,2015.09.20 20:02
P0003,голодный,199,10,
P0004,Űrállomás krízis,"999,5",1,2015.09.20 12:47
P0005,Ovális iroda,2 399,1,2015.09.20 07:31
P0001;đói;449;1;2015.09.20 20:00
P0002;배고픈;399;1;2015.09.20 20:02
P0003;голодный;199;10;
P0004;Űrállomás krízis;"999,5";1;2015.09.20 12:47
P0005;Ovális iroda;2 399;1;2015.09.20 07:31