From b842cf4cc7e4e22caa3110157cec9b4959582993 Mon Sep 17 00:00:00 2001
From: Ian Carroll
Date: Tue, 24 Sep 2024 10:19:44 -0400
Subject: [PATCH] http file system directed to stream by an "Accept-Ranges": "none" response (#1631)

---
 fsspec/implementations/http.py            | 18 ++++++++------
 fsspec/implementations/tests/test_http.py |  8 ++++--
 fsspec/tests/conftest.py                  | 30 +++++++++++------------
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index 7b5a38bb3..47dfb88f9 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -358,9 +358,10 @@ def _open(
         kw = self.kwargs.copy()
         kw["asynchronous"] = self.asynchronous
         kw.update(kwargs)
-        size = size or self.info(path, **kwargs)["size"]
+        info = {}
+        size = size or info.update(self.info(path, **kwargs)) or info["size"]
         session = sync(self.loop, self.set_session)
-        if block_size and size:
+        if block_size and size and info.get("partial", True):
             return HTTPFile(
                 self,
                 path,
@@ -520,9 +521,9 @@ async def _isdir(self, path):
 
 class HTTPFile(AbstractBufferedFile):
     """
-    A file-like object pointing to a remove HTTP(S) resource
+    A file-like object pointing to a remote HTTP(S) resource
 
-    Supports only reading, with read-ahead of a predermined block-size.
+    Supports only reading, with read-ahead of a predetermined block-size.
 
     In the case that the server does not supply the filesize, only reading of
     the complete file in one go is supported.
@@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
     async with r:
         r.raise_for_status()
 
-        # TODO:
-        # recognise lack of 'Accept-Ranges',
-        # or 'Accept-Ranges': 'none' (not 'bytes')
-        # to mean streaming only, no random access => return None
         if "Content-Length" in r.headers:
             # Some servers may choose to ignore Accept-Encoding and return
             # compressed content, in which case the returned size is unreliable.
@@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
         if "Content-Type" in r.headers:
             info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
 
+        if r.headers.get("Accept-Ranges") == "none":
+            # Some servers may explicitly discourage partial content requests, but
+            # the lack of "Accept-Ranges" does not always indicate they would fail
+            info["partial"] = False
+
         info["url"] = str(r.url)
 
         for checksum_field in ["ETag", "Content-MD5", "Digest"]:
diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py
index 81e438a81..568284e22 100644
--- a/fsspec/implementations/tests/test_http.py
+++ b/fsspec/implementations/tests/test_http.py
@@ -237,9 +237,13 @@ def test_random_access(server, headers):
 @pytest.mark.parametrize(
     "headers",
     [
-        {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
+        # HTTPFile seeks, response headers lack size, assumed no range support
+        {"head_ok": "true", "head_give_length": "true"},
+        # HTTPFile seeks, response is not a range
         {"ignore_range": "true", "give_length": "true"},
         {"ignore_range": "true", "give_range": "true"},
+        # HTTPStreamFile does not seek (past 0)
+        {"accept_range": "none", "head_ok": "true", "give_length": "true"},
     ],
 )
 def test_no_range_support(server, headers):
@@ -247,8 +251,8 @@ def test_no_range_support(server, headers):
     url = server + "/index/realfile"
     with h.open(url, "rb") as f:
         # Random access is not possible if the server doesn't respect Range
-        f.seek(5)
         with pytest.raises(ValueError):
+            f.seek(5)
             f.read(10)
 
         # Reading from the beginning should still work
diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py
index fb1efb041..413ed717b 100644
--- a/fsspec/tests/conftest.py
+++ b/fsspec/tests/conftest.py
@@ -135,10 +135,10 @@ def read_chunks(self):
             self.rfile.readline()
 
     def do_HEAD(self):
+        r_headers = {}
         if "head_not_auth" in self.headers:
-            return self._respond(
-                403, {"Content-Length": 123}, b"not authorized for HEAD request"
-            )
+            r_headers["Content-Length"] = 123
+            return self._respond(403, r_headers, b"not authorized for HEAD request")
         elif "head_ok" not in self.headers:
             return self._respond(405)
 
@@ -148,23 +148,23 @@ def do_HEAD(self):
             return self._respond(404)
 
         if ("give_length" in self.headers) or ("head_give_length" in self.headers):
-            response_headers = {"Content-Length": len(file_data)}
             if "zero_length" in self.headers:
-                response_headers["Content-Length"] = 0
+                r_headers["Content-Length"] = 0
             elif "gzip_encoding" in self.headers:
                 file_data = gzip.compress(file_data)
-                response_headers["Content-Encoding"] = "gzip"
-                response_headers["Content-Length"] = len(file_data)
-
-            self._respond(200, response_headers)
+                r_headers["Content-Encoding"] = "gzip"
+                r_headers["Content-Length"] = len(file_data)
+            else:
+                r_headers["Content-Length"] = len(file_data)
         elif "give_range" in self.headers:
-            self._respond(
-                200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
-            )
+            r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}"
         elif "give_etag" in self.headers:
-            self._respond(200, {"ETag": "xxx"})
-        else:
-            self._respond(200)  # OK response, but no useful info
+            r_headers["ETag"] = "xxx"
+
+        if self.headers.get("accept_range") == "none":
+            r_headers["Accept-Ranges"] = "none"
+
+        self._respond(200, r_headers)
 
 
 @contextlib.contextmanager