Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

http file system directed to stream by an "Accept-Ranges": "none" response #1631

Merged
merged 6 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions fsspec/implementations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,10 @@ def _open(
kw = self.kwargs.copy()
kw["asynchronous"] = self.asynchronous
kw.update(kwargs)
size = size or self.info(path, **kwargs)["size"]
info = {}
size = size or info.update(self.info(path, **kwargs)) or info["size"]
session = sync(self.loop, self.set_session)
if block_size and size:
if block_size and size and info.get("partial", True):
return HTTPFile(
self,
path,
Expand Down Expand Up @@ -520,9 +521,9 @@ async def _isdir(self, path):

class HTTPFile(AbstractBufferedFile):
"""
A file-like object pointing to a remove HTTP(S) resource
A file-like object pointing to a remote HTTP(S) resource

Supports only reading, with read-ahead of a predermined block-size.
Supports only reading, with read-ahead of a predetermined block-size.

In the case that the server does not supply the filesize, only reading of
the complete file in one go is supported.
Expand Down Expand Up @@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
async with r:
r.raise_for_status()

# TODO:
# recognise lack of 'Accept-Ranges',
# or 'Accept-Ranges': 'none' (not 'bytes')
# to mean streaming only, no random access => return None
if "Content-Length" in r.headers:
# Some servers may choose to ignore Accept-Encoding and return
# compressed content, in which case the returned size is unreliable.
Expand All @@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
if "Content-Type" in r.headers:
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]

if r.headers.get("Accept-Ranges") == "none":
# Some servers may explicitly discourage partial content requests, but
# the lack of "Accept-Ranges" does not always indicate they would fail
info["partial"] = False

info["url"] = str(r.url)

for checksum_field in ["ETag", "Content-MD5", "Digest"]:
Expand Down
8 changes: 6 additions & 2 deletions fsspec/implementations/tests/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,18 +237,22 @@ def test_random_access(server, headers):
@pytest.mark.parametrize(
"headers",
[
{"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
# HTTPFile seeks, response headers lack size, assumed no range support
{"head_ok": "true", "head_give_length": "true"},
# HTTPFile seeks, response is not a range
{"ignore_range": "true", "give_length": "true"},
{"ignore_range": "true", "give_range": "true"},
# HTTPStreamFile does not seek (past 0)
{"accept_range": "none", "head_ok": "true", "give_length": "true"},
],
)
def test_no_range_support(server, headers):
h = fsspec.filesystem("http", headers=headers)
url = server + "/index/realfile"
with h.open(url, "rb") as f:
# Random access is not possible if the server doesn't respect Range
f.seek(5)
with pytest.raises(ValueError):
f.seek(5)
f.read(10)

# Reading from the beginning should still work
Expand Down
30 changes: 15 additions & 15 deletions fsspec/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ def read_chunks(self):
self.rfile.readline()

def do_HEAD(self):
r_headers = {}
if "head_not_auth" in self.headers:
return self._respond(
403, {"Content-Length": 123}, b"not authorized for HEAD request"
)
r_headers["Content-Length"] = 123
return self._respond(403, r_headers, b"not authorized for HEAD request")
elif "head_ok" not in self.headers:
return self._respond(405)

Expand All @@ -148,23 +148,23 @@ def do_HEAD(self):
return self._respond(404)

if ("give_length" in self.headers) or ("head_give_length" in self.headers):
response_headers = {"Content-Length": len(file_data)}
if "zero_length" in self.headers:
response_headers["Content-Length"] = 0
r_headers["Content-Length"] = 0
elif "gzip_encoding" in self.headers:
file_data = gzip.compress(file_data)
response_headers["Content-Encoding"] = "gzip"
response_headers["Content-Length"] = len(file_data)

self._respond(200, response_headers)
r_headers["Content-Encoding"] = "gzip"
r_headers["Content-Length"] = len(file_data)
else:
r_headers["Content-Length"] = len(file_data)
elif "give_range" in self.headers:
self._respond(
200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
)
r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}"
elif "give_etag" in self.headers:
self._respond(200, {"ETag": "xxx"})
else:
self._respond(200) # OK response, but no useful info
r_headers["ETag"] = "xxx"

if self.headers.get("accept_range") == "none":
r_headers["Accept-Ranges"] = "none"

self._respond(200, r_headers)


@contextlib.contextmanager
Expand Down
Loading