From 24c4f9cd8975f7628de298d7781a485724f3b815 Mon Sep 17 00:00:00 2001 From: Greg Thole Date: Wed, 25 Sep 2013 09:25:53 -0400 Subject: [PATCH 1/8] Allow digits in header names --- warc/warc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 0c762a6..1503780 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -311,10 +311,11 @@ def tell(self): return self.fileobj.fileobj.tell() else: return self.fileobj.tell() - + + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") - RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") + RE_HEADER = re.compile(r"([\w\-]+): *(.*)\r\n") SUPPORTED_VERSIONS = ["1.0"] def __init__(self, fileobj): From 3a54f6a033aea687bfb750d221b73432da6f094d Mon Sep 17 00:00:00 2001 From: Greg Thole Date: Wed, 25 Sep 2013 09:41:36 -0400 Subject: [PATCH 2/8] Add test --- .gitignore | 1 + warc/tests/test_warc.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0a312c6..9de09a2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/_build/ build/ .coverage htmlcov/ +virtualenv diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 92545ba..6de32ae 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -57,6 +57,7 @@ def f(type): "Content-Length: 10\r\n" + "WARC-Date: 2012-02-10T16:15:52Z\r\n" + "Content-Type: application/http; msgtype=response\r\n" + + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + From 8cec5224adfbb1047c654c294afed7faa24e533a Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Thu, 17 Oct 2013 21:16:15 +0800 Subject: [PATCH 3/8] allow dots in warc headers --- warc/tests/test_warc.py | 1 + warc/warc.py | 134 ++++++++++++++++++++-------------------- 2 files changed, 68 insertions(+), 67 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 6de32ae..029a899 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -58,6 +58,7 @@ def f(type): "WARC-Date: 2012-02-10T16:15:52Z\r\n" + "Content-Type: application/http; msgtype=response\r\n" + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + + "Page.Ly: v4.1\r\n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + diff --git a/warc/warc.py b/warc/warc.py index 1503780..2908006 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -21,17 +21,17 @@ class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. - It provides dictionary like interface for accessing the headers. - + It provides dictionary like interface for accessing the headers. + The following mandatory fields are accessible also as attributes. - + * h.record_id == h['WARC-Record-ID'] * h.content_length == int(h['Content-Length']) * h.date == h['WARC-Date'] * h.type == h['WARC-Type'] - - :params headers: dictionary of headers. - :params defaults: If True, important headers like WARC-Record-ID, + + :params headers: dictionary of headers. + :params defaults: If True, important headers like WARC-Record-ID, WARC-Date, Content-Type and Content-Length are initialized to automatically if not already present. TODO: @@ -40,9 +40,9 @@ class WARCHeader(CaseInsensitiveDict): * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) @@ -53,7 +53,7 @@ class WARCHeader(CaseInsensitiveDict): response='application/http; msgtype=response', request='application/http; msgtype=request', metadata='application/warc-fields') - + KNOWN_HEADERS = { "type": "WARC-Type", "date": "WARC-Date", @@ -65,16 +65,16 @@ class WARCHeader(CaseInsensitiveDict): "content_type": "Content-Type", "content_length": "Content-Length" } - + def __init__(self, headers, defaults=False): self.version = "WARC/1.0" CaseInsensitiveDict.__init__(self, headers) if defaults: self.init_defaults() - + def init_defaults(self): """Initializes important headers to default values, if not already specified. - + The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. The Content-Type is set based on the WARC-Type header. @@ -86,7 +86,7 @@ def init_defaults(self): self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') if "Content-Type" not in self: self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream") - + def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ @@ -99,7 +99,7 @@ def write_to(self, f): f.write(": ") f.write(value) f.write("\r\n") - + # Header ends with an extra CRLF f.write("\r\n") @@ -107,27 +107,27 @@ def write_to(self, f): def content_length(self): """The Content-Length header as int.""" return int(self['Content-Length']) - + @property - def type(self): + def type(self): """The value of WARC-Type header.""" return self.get('WARC-Type') - + @property def record_id(self): """The value of WARC-Record-ID header.""" return self['WARC-Record-ID'] - + @property def date(self): """The value of WARC-Date header.""" return self['WARC-Date'] - + def __str__(self): f = StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): return "" % (self.type, self.record_id) @@ -135,7 +135,7 @@ class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ def __init__(self, header=None, payload=None, headers={}, defaults=True): - """Creates a new WARC record. + """Creates a new WARC record. """ if header is None and defaults is True: @@ -143,26 +143,26 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self.header = header or WARCHeader(headers, defaults=True) self.payload = payload - + if defaults is True and 'Content-Length' not in self.header: if payload: self.header['Content-Length'] = str(len(payload)) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() - + def write_to(self, f): self.header.write_to(f) f.write(self.payload) f.write("\r\n") f.write("\r\n") f.flush() - + @property def type(self): """Record type""" @@ -172,11 +172,11 @@ def type(self): def url(self): """The value of the WARC-Target-URI header if the record is of type "response".""" return self.header.get('WARC-Target-URI') - + @property def ip_address(self): - """The IP address of the host contacted to retrieve the content of this record. - + """The IP address of the host contacted to retrieve the content of this record. + This value is available from the WARC-IP-Address header.""" return self.header.get('WARC-IP-Address') @@ -184,46 +184,46 @@ def ip_address(self): def date(self): """UTC timestamp of the record.""" return self.header.get("WARC-Date") - + @property def checksum(self): return self.header.get('WARC-Payload-Digest') - + @property def offset(self): """Offset of this record in the warc file from which this record is read. """ pass - + def __getitem__(self, name): return self.header[name] def __setitem__(self, name, value): self.header[name] = value - + def __contains__(self, name): return name in self.header - + def __str__(self): f = StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): return "" % (self.type, self['WARC-Record-ID']) - + @staticmethod def from_response(response): """Creates a WARCRecord from given response object. - This must be called before reading the response. The response can be + This must be called before reading the response. The response can be read after this method is called. - + :param response: An instance of :class:`requests.models.Response`. """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - + # HTTP status line, headers and body as strings status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) @@ -234,7 +234,7 @@ def from_response(response): # Build the payload to create warc file. payload = status_line + "\r\n" + headers + "\r\n" + body - + headers = { "WARC-Type": "response", "WARC-Target-URI": response.request.full_url.encode('utf-8') @@ -249,19 +249,19 @@ def __init__(self, filename=None, mode=None, fileobj=None, compress=None): # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True - + if compress: fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) - + self.fileobj = fileobj self._reader = None - + @property def reader(self): if self._reader is None: self._reader = WARCReader(self.fileobj) return self._reader - + def write_record(self, warc_record): """Adds a warc record to this WARC file. """ @@ -270,32 +270,32 @@ def write_record(self, warc_record): # so that each record can be read independetly. if isinstance(self.fileobj, gzip2.GzipFile): self.fileobj.close_member() - + def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - + def __iter__(self): return iter(self.reader) - + def close(self): self.fileobj.close() - + def browse(self): """Utility to browse through the records in the warc file. - - This returns an iterator over (record, offset, size) for each record in - the file. If the file is gzip compressed, the offset and size will - corresponds to the compressed file. - - The payload of each record is limited to 1MB to keep memory consumption + + This returns an iterator over (record, offset, size) for each record in + the file. If the file is gzip compressed, the offset and size will + corresponds to the compressed file. + + The payload of each record is limited to 1MB to keep memory consumption under control. """ offset = 0 for record in self.reader: # Just read the first 1MB of the payload. - # This will make sure memory consuption is under control and it - # is possible to look at the first MB of the payload, which is + # This will make sure memory consuption is under control and it + # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. record.payload = StringIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() @@ -304,36 +304,36 @@ def browse(self): offset = next_offset def tell(self): - """Returns the file offset. If this is a compressed file, then the + """Returns the file offset. If this is a compressed file, then the offset in the compressed file is returned. """ if isinstance(self.fileobj, gzip2.GzipFile): return self.fileobj.fileobj.tell() else: - return self.fileobj.tell() + return self.fileobj.tell() class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") - RE_HEADER = re.compile(r"([\w\-]+): *(.*)\r\n") + RE_HEADER = re.compile(r"([\w\-\.]+): *(.*)\r\n") SUPPORTED_VERSIONS = ["1.0"] - + def __init__(self, fileobj): self.fileobj = fileobj self.current_payload = None - + def read_header(self, fileobj): version_line = fileobj.readline() if not version_line: return None - + m = self.RE_VERSION.match(version_line) if not m: raise IOError("Bad version line: %r" % version_line) version = m.group(1) if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - + headers = {} while True: line = fileobj.readline() @@ -345,13 +345,13 @@ def read_header(self, fileobj): name, value = m.groups() headers[name] = value return WARCHeader(headers) - + def expect(self, fileobj, expected_line, message=None): line = fileobj.readline() if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) - + def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: @@ -370,11 +370,11 @@ def read_record(self): return None else: fileobj = self.fileobj - + header = self.read_header(fileobj) if header is None: return None - + self.current_payload = FilePart(fileobj, header.content_length) record = WARCRecord(header, self.current_payload, defaults=False) return record From 90cc0e5c6bbceb90628341b3e3b725798c90be93 Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Thu, 17 Oct 2013 21:47:44 +0800 Subject: [PATCH 4/8] log warning on bad header read --- warc/tests/test_warc.py | 2 ++ warc/warc.py | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 029a899..1d88904 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -59,6 +59,7 @@ def f(type): "Content-Type: application/http; msgtype=response\r\n" + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + "Page.Ly: v4.1\r\n" + + "BadHeader: \n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + @@ -75,6 +76,7 @@ def test_read_header1(self): assert h.record_id == "" assert h.type == "response" assert h.content_length == 10 + assert 'BadHeader' not in h def test_empty(self): reader = WARCReader(StringIO("")) diff --git a/warc/warc.py b/warc/warc.py index 2908006..c58ea1c 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -18,6 +18,8 @@ from . import gzip2 from .utils import CaseInsensitiveDict, FilePart +logger = logging.getLogger(__name__) + class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. @@ -340,10 +342,11 @@ def read_header(self, fileobj): if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) - if not m: - raise IOError("Bad header line: %r" % line) - name, value = m.groups() - headers[name] = value + if m: + name, value = m.groups() + headers[name] = value + else: + logger.warning("Bad header line: %r" % line) return WARCHeader(headers) def expect(self, fileobj, expected_line, message=None): From 649766f1e8ccf9731afe45ba61df6c2c4fb85186 Mon Sep 17 00:00:00 2001 From: Greg Thole Date: Thu, 17 Oct 2013 17:08:58 -0400 Subject: [PATCH 5/8] Simplified reader that works off delimiter --- warc/tests/test_warc.py | 32 ++++++++++++++++++++- warc/warc.py | 61 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 1d88904..c5f6795 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,7 +1,9 @@ -from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile +from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile, \ + SimpleWARCReader from StringIO import StringIO + class TestWARCHeader: def test_attrs(self): h = WARCHeader({ @@ -95,6 +97,34 @@ def read_multiple_records(self): rec = reader.read_record() assert rec is not None + +class TestSimpleWARCReader: + def test_read_header1(self): + f = StringIO(SAMPLE_WARC_RECORD_TEXT) + h, b = SimpleWARCReader(f).read_record() + assert h['WARC-Date'] == "2012-02-10T16:15:52Z" + assert h['WARC-Record-ID'] == "" + assert h['WARC-Type'] == "response" + assert h['Content-Length'] == '10' + + def test_empty(self): + reader = WARCReader(StringIO("")) + assert reader.read_record() is None + + def test_read_record(self): + f = StringIO(SAMPLE_WARC_RECORD_TEXT) + reader = SimpleWARCReader(f) + headers, body = reader.read_record() + assert body == "Helloworld" + + def read_multiple_records(self): + f = StringIO(SAMPLE_WARC_RECORD_TEXT * 5) + reader = SimpleWARCReader(f) + for i in range(5): + rec = reader.read_record() + assert rec is not None + + class TestWarcFile: def test_read(self): f = WARCFile(fileobj=StringIO(SAMPLE_WARC_RECORD_TEXT)) diff --git a/warc/warc.py b/warc/warc.py index c58ea1c..0c528f5 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -314,7 +314,6 @@ def tell(self): else: return self.fileobj.tell() - class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([\w\-\.]+): *(.*)\r\n") @@ -395,3 +394,63 @@ def __iter__(self): while record is not None: yield record record = self.read_record() + + +class SimpleWARCReader(WARCReader): + RE_HEADER = re.compile(r"([\w\-]+): *(.*)\r?\n") + + def __init__(self, fileobj): + self.fileobj = fileobj + self.pos = 0 + + def __iter__(self): + return self + + def next(self): + record = self.read_record() + if record is None: + raise StopIteration + return record + + def read_record(self): + try: + self._read_version() + except AssertionError: + return + headers = self._read_header() + body = self._read_body() + return (headers, body) + + def _read_version(self): + self.fileobj.seek(self.pos) + line = self.fileobj.readline() + assert line == 'WARC/1.0\r\n' + + def _read_header(self): + headers = {} + while True: + line = self.fileobj.readline() + if line == "\r\n": # end of headers + break + m = self.RE_HEADER.match(line) + if not m: + logging.warning("Bad header line: %r" % line) + continue + name, value = m.groups() + headers[name] = value.strip() + return headers + + def _read_body(self): + body = '' + line = '' + while not (line == 'WARC/1.0\r\n' and body.endswith('\r\n\r\n')): + body += line + pos = self.fileobj.tell() + line = self.fileobj.readline() + if self.fileobj.tell() == pos: + break + self.pos = pos + return body.strip('\r\n') + + def close(self): + self.fileobj.close() From d24dcf55045e1dac2250e751f3a453e5466b4b31 Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Wed, 23 Oct 2013 05:58:19 +0800 Subject: [PATCH 6/8] generalized simple warc reader line reading and added fileobj and iterator versions --- warc/tests/test_warc.py | 40 +++++++++++++++++++++++++----- warc/warc.py | 54 +++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 21 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index c5f6795..cdf0573 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,5 +1,5 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile, \ - SimpleWARCReader + SimpleFileobjWARCReader, SimpleIteratorWARCReader from StringIO import StringIO @@ -98,28 +98,55 @@ def read_multiple_records(self): assert rec is not None -class TestSimpleWARCReader: +class TestSimpleFileobjWARCReader: def test_read_header1(self): f = StringIO(SAMPLE_WARC_RECORD_TEXT) - h, b = SimpleWARCReader(f).read_record() + h, b = SimpleFileobjWARCReader(f).read_record() assert h['WARC-Date'] == "2012-02-10T16:15:52Z" assert h['WARC-Record-ID'] == "" assert h['WARC-Type'] == "response" assert h['Content-Length'] == '10' def test_empty(self): - reader = WARCReader(StringIO("")) + reader = SimpleFileobjWARCReader(StringIO("")) assert reader.read_record() is None def test_read_record(self): f = StringIO(SAMPLE_WARC_RECORD_TEXT) - reader = SimpleWARCReader(f) + reader = SimpleFileobjWARCReader(f) headers, body = reader.read_record() assert body == "Helloworld" def read_multiple_records(self): f = StringIO(SAMPLE_WARC_RECORD_TEXT * 5) - reader = SimpleWARCReader(f) + reader = SimpleFileobjWARCReader(f) + for i in range(5): + rec = reader.read_record() + assert rec is not None + + +class TestSimpleIteratorWARCReader: + def test_read_header1(self): + i = [r + "\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\n")] + h, b = SimpleIteratorWARCReader(iter(i)).read_record() + assert h['WARC-Date'] == "2012-02-10T16:15:52Z" + assert h['WARC-Record-ID'] == "" + assert h['WARC-Type'] == "response" + assert h['Content-Length'] == '10' + + def test_empty(self): + reader = SimpleIteratorWARCReader(iter([])) + assert reader.read_record() is None + + def test_read_record(self): + i = [r + "\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\n")] + reader = SimpleIteratorWARCReader(iter(i)) + headers, body = reader.read_record() + assert body == "Helloworld" + + def read_multiple_records(self): + i = [r + "\n" for r in (SAMPLE_WARC_RECORD_TEXT * 5).split("\n")] + reader = SimpleIteratorWARCReader(iter(i)) for i in range(5): rec = reader.read_record() assert rec is not None @@ -152,5 +179,6 @@ def test_long_header(self): h = f.read_record().header assert h['WARC-Payload-Digest'] == "sha1:M4VJCCJQJKPACSSSBHURM572HSDQHO2P" + if __name__ == '__main__': TestWARCReader().test_read_header() diff --git a/warc/warc.py b/warc/warc.py index 0c528f5..53355f9 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -398,10 +398,11 @@ def __iter__(self): class SimpleWARCReader(WARCReader): RE_HEADER = re.compile(r"([\w\-]+): *(.*)\r?\n") + VERSION = "WARC/1.0\r\n" - def __init__(self, fileobj): - self.fileobj = fileobj - self.pos = 0 + def __init__(self, datasource): + self.datasource = datasource + self.stack = [] def __iter__(self): return self @@ -421,15 +422,22 @@ def read_record(self): body = self._read_body() return (headers, body) + def _next_line(self): + if self.stack: + return self.stack.pop() + return self._read_line() + + def _read_line(self): + raise NotImplementedError() + def _read_version(self): - self.fileobj.seek(self.pos) - line = self.fileobj.readline() - assert line == 'WARC/1.0\r\n' + line = self._next_line() + assert line == self.VERSION def _read_header(self): headers = {} while True: - line = self.fileobj.readline() + line = self._next_line() if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) @@ -441,16 +449,32 @@ def _read_header(self): return headers def _read_body(self): - body = '' - line = '' - while not (line == 'WARC/1.0\r\n' and body.endswith('\r\n\r\n')): + body = line = '' + while not (line == self.VERSION and body.endswith('\r\n\r\n')): body += line - pos = self.fileobj.tell() - line = self.fileobj.readline() - if self.fileobj.tell() == pos: + line = self._next_line() + if line == '': break - self.pos = pos return body.strip('\r\n') def close(self): - self.fileobj.close() + raise NotImplementedError() + + +class SimpleFileobjWARCReader(SimpleWARCReader): + def _read_line(self): + return self.datasource.readline() + + def close(self): + self.datasource.close() + + +class SimpleIteratorWARCReader(SimpleWARCReader): + def _read_line(self): + try: + return self.datasource.next() + except StopIteration: + return '' + + def close(self): + pass From c00423c379e280817ee528444b9427095ff96937 Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Thu, 24 Oct 2013 00:16:01 +0800 Subject: [PATCH 7/8] fixed bug where reader failed with bad linebreak format, added tests --- warc/tests/test_warc.py | 29 ++++++++++++++++++++++------- warc/warc.py | 5 ++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index cdf0573..acc07f6 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -61,7 +61,7 @@ def f(type): "Content-Type: application/http; msgtype=response\r\n" + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + "Page.Ly: v4.1\r\n" + - "BadHeader: \n" + + "BadHeader%: \r\n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + @@ -69,6 +69,8 @@ def f(type): "Helloworld" + "\r\n\r\n" ) +SAMPLE_WARC_RECORD_LIST = [r + "\r\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\r\n")] + class TestWARCReader: def test_read_header1(self): @@ -124,11 +126,17 @@ def read_multiple_records(self): rec = reader.read_record() assert rec is not None + def test_bad_linebreaks(self): + f = StringIO(SAMPLE_WARC_RECORD_TEXT[:-2] * 5) + reader = SimpleFileobjWARCReader(f) + for i in range(5): + rec = reader.read_record() + assert rec is not None class TestSimpleIteratorWARCReader: def test_read_header1(self): - i = [r + "\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\n")] - h, b = SimpleIteratorWARCReader(iter(i)).read_record() + i = iter(SAMPLE_WARC_RECORD_LIST) + h, b = SimpleIteratorWARCReader(i).read_record() assert h['WARC-Date'] == "2012-02-10T16:15:52Z" assert h['WARC-Record-ID'] == "" assert h['WARC-Type'] == "response" @@ -139,14 +147,21 @@ def test_empty(self): assert reader.read_record() is None def test_read_record(self): - i = [r + "\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\n")] - reader = SimpleIteratorWARCReader(iter(i)) + i = iter(SAMPLE_WARC_RECORD_LIST) + reader = SimpleIteratorWARCReader(i) headers, body = reader.read_record() assert body == "Helloworld" def read_multiple_records(self): - i = [r + "\n" for r in (SAMPLE_WARC_RECORD_TEXT * 5).split("\n")] - reader = SimpleIteratorWARCReader(iter(i)) + i = iter(SAMPLE_WARC_RECORD_LIST * 5) + reader = SimpleIteratorWARCReader(i) + for i in range(5): + rec = reader.read_record() + assert rec is not None + + def test_bad_linebreaks(self): + f = (SAMPLE_WARC_RECORD_LIST[:-1] * 5) + reader = SimpleIteratorWARCReader(iter(f)) for i in range(5): rec = reader.read_record() assert rec is not None diff --git a/warc/warc.py b/warc/warc.py index 53355f9..f0779e2 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -453,7 +453,10 @@ def _read_body(self): while not (line == self.VERSION and body.endswith('\r\n\r\n')): body += line line = self._next_line() - if line == '': + if line == self.VERSION: + self.stack.append(self.VERSION) + break + elif line == '': # StringIO EOF break return body.strip('\r\n') From d5d6c879d2f475bc57ca87d69d5fe2e434deeeaf Mon Sep 17 00:00:00 2001 From: Ben Homnick Date: Thu, 24 Oct 2013 01:41:23 +0800 Subject: [PATCH 8/8] added bad header test case --- warc/tests/test_warc.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index acc07f6..1ceea71 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -54,6 +54,7 @@ def f(type): assert f("warcinfo")["Content-Type"] == "application/warc-fields" assert f("newtype")["Content-Type"] == "application/octet-stream" + SAMPLE_WARC_RECORD_TEXT = ( "WARC/1.0\r\n" + "Content-Length: 10\r\n" + @@ -62,14 +63,32 @@ def f(type): "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n" + "Page.Ly: v4.1\r\n" + "BadHeader%: \r\n" + + "BadHeader: \n" + "WARC-Type: response\r\n" + "WARC-Record-ID: \r\n" + "WARC-Target-URI: http://example.com/\r\n" + "\r\n" + "Helloworld" + - "\r\n\r\n" + "\r\n" + + "\r\n" ) -SAMPLE_WARC_RECORD_LIST = [r + "\r\n" for r in SAMPLE_WARC_RECORD_TEXT.split("\r\n")] +SAMPLE_WARC_RECORD_LIST = [ + "WARC/1.0\r\n", + "Content-Length: 10\r\n", + "WARC-Date: 2012-02-10T16:15:52Z\r\n", + "Content-Type: application/http; msgtype=response\r\n", + "P3P: policyref=\"http://www.w3.org/2001/05/P3P/p3p.xml\"\r\n", + "Page.Ly: v4.1\r\n", + "BadHeader%: \r\n", + "BadHeader: \n", + "WARC-Type: response\r\n", + "WARC-Record-ID: \r\n", + "WARC-Target-URI: http://example.com/\r\n", + "\r\n", + "Helloworld", + "\r\n", + "\r\n", +] class TestWARCReader: @@ -81,6 +100,7 @@ def test_read_header1(self): assert h.type == "response" assert h.content_length == 10 assert 'BadHeader' not in h + assert 'BadHeader%' not in h def test_empty(self): reader = WARCReader(StringIO(""))