Skip to content

Commit

Permalink
http file system directed to stream by an "Accept-Ranges": "none" res…
Browse files Browse the repository at this point in the history
…ponse (#1631)
  • Loading branch information
itcarroll authored Sep 24, 2024
1 parent 1f61512 commit b842cf4
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 25 deletions.
18 changes: 10 additions & 8 deletions fsspec/implementations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,10 @@ def _open(
kw = self.kwargs.copy()
kw["asynchronous"] = self.asynchronous
kw.update(kwargs)
size = size or self.info(path, **kwargs)["size"]
info = {}
size = size or info.update(self.info(path, **kwargs)) or info["size"]
session = sync(self.loop, self.set_session)
if block_size and size:
if block_size and size and info.get("partial", True):
return HTTPFile(
self,
path,
Expand Down Expand Up @@ -520,9 +521,9 @@ async def _isdir(self, path):

class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remove HTTP(S) resource
A file-like object pointing to a remote HTTP(S) resource
Supports only reading, with read-ahead of a predermined block-size.
Supports only reading, with read-ahead of a predetermined block-size.
In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Expand Down Expand Up @@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
async with r:
r.raise_for_status()

# TODO:
# recognise lack of 'Accept-Ranges',
# or 'Accept-Ranges': 'none' (not 'bytes')
# to mean streaming only, no random access => return None
if "Content-Length" in r.headers:
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
Expand All @@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
if "Content-Type" in r.headers:
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]

if r.headers.get("Accept-Ranges") == "none":
# Some servers may explicitly discourage partial content requests, but
# the lack of "Accept-Ranges" does not always indicate they would fail
info["partial"] = False

info["url"] = str(r.url)

for checksum_field in ["ETag", "Content-MD5", "Digest"]:
Expand Down
8 changes: 6 additions & 2 deletions fsspec/implementations/tests/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,18 +237,22 @@ def test_random_access(server, headers):
@pytest.mark.parametrize(
"headers",
[
{"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
# HTTPFile seeks, response headers lack size, assumed no range support
{"head_ok": "true", "head_give_length": "true"},
# HTTPFile seeks, response is not a range
{"ignore_range": "true", "give_length": "true"},
{"ignore_range": "true", "give_range": "true"},
# HTTPStreamFile does not seek (past 0)
{"accept_range": "none", "head_ok": "true", "give_length": "true"},
],
)
def test_no_range_support(server, headers):
h = fsspec.filesystem("http", headers=headers)
url = server + "/index/realfile"
with h.open(url, "rb") as f:
# Random access is not possible if the server doesn't respect Range
f.seek(5)
with pytest.raises(ValueError):
f.seek(5)
f.read(10)

# Reading from the beginning should still work
Expand Down
30 changes: 15 additions & 15 deletions fsspec/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ def read_chunks(self):
self.rfile.readline()

def do_HEAD(self):
r_headers = {}
if "head_not_auth" in self.headers:
return self._respond(
403, {"Content-Length": 123}, b"not authorized for HEAD request"
)
r_headers["Content-Length"] = 123
return self._respond(403, r_headers, b"not authorized for HEAD request")
elif "head_ok" not in self.headers:
return self._respond(405)

Expand All @@ -148,23 +148,23 @@ def do_HEAD(self):
return self._respond(404)

if ("give_length" in self.headers) or ("head_give_length" in self.headers):
response_headers = {"Content-Length": len(file_data)}
if "zero_length" in self.headers:
response_headers["Content-Length"] = 0
r_headers["Content-Length"] = 0
elif "gzip_encoding" in self.headers:
file_data = gzip.compress(file_data)
response_headers["Content-Encoding"] = "gzip"
response_headers["Content-Length"] = len(file_data)

self._respond(200, response_headers)
r_headers["Content-Encoding"] = "gzip"
r_headers["Content-Length"] = len(file_data)
else:
r_headers["Content-Length"] = len(file_data)
elif "give_range" in self.headers:
self._respond(
200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
)
r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}"
elif "give_etag" in self.headers:
self._respond(200, {"ETag": "xxx"})
else:
self._respond(200) # OK response, but no useful info
r_headers["ETag"] = "xxx"

if self.headers.get("accept_range") == "none":
r_headers["Accept-Ranges"] = "none"

self._respond(200, r_headers)


@contextlib.contextmanager
Expand Down

0 comments on commit b842cf4

Please sign in to comment.