From b842cf4cc7e4e22caa3110157cec9b4959582993 Mon Sep 17 00:00:00 2001
From: Ian Carroll <carroll.ian@gmail.com>
Date: Tue, 24 Sep 2024 10:19:44 -0400
Subject: [PATCH] http file system directed to stream by an "Accept-Ranges":
 "none" response (#1631)

---
 fsspec/implementations/http.py            | 18 ++++++++------
 fsspec/implementations/tests/test_http.py |  8 ++++--
 fsspec/tests/conftest.py                  | 30 +++++++++++------------
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index 7b5a38bb3..47dfb88f9 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -358,9 +358,10 @@ def _open(
         kw = self.kwargs.copy()
         kw["asynchronous"] = self.asynchronous
         kw.update(kwargs)
-        size = size or self.info(path, **kwargs)["size"]
+        info = {}
+        size = size or info.update(self.info(path, **kwargs)) or info["size"]
         session = sync(self.loop, self.set_session)
-        if block_size and size:
+        if block_size and size and info.get("partial", True):
             return HTTPFile(
                 self,
                 path,
@@ -520,9 +521,9 @@ async def _isdir(self, path):
 
 class HTTPFile(AbstractBufferedFile):
     """
-    A file-like object pointing to a remove HTTP(S) resource
+    A file-like object pointing to a remote HTTP(S) resource
 
-    Supports only reading, with read-ahead of a predermined block-size.
+    Supports only reading, with read-ahead of a predetermined block-size.
 
     In the case that the server does not supply the filesize, only reading of
     the complete file in one go is supported.
@@ -835,10 +836,6 @@ async def _file_info(url, session, size_policy="head", **kwargs):
     async with r:
         r.raise_for_status()
 
-        # TODO:
-        #  recognise lack of 'Accept-Ranges',
-        #                 or 'Accept-Ranges': 'none' (not 'bytes')
-        #  to mean streaming only, no random access => return None
         if "Content-Length" in r.headers:
             # Some servers may choose to ignore Accept-Encoding and return
             # compressed content, in which case the returned size is unreliable.
@@ -853,6 +850,11 @@ async def _file_info(url, session, size_policy="head", **kwargs):
         if "Content-Type" in r.headers:
             info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
 
+        if r.headers.get("Accept-Ranges") == "none":
+            # A server can explicitly opt out of range requests with "none"; a
+            # missing "Accept-Ranges" header does not imply range requests would fail
+            info["partial"] = False
+
         info["url"] = str(r.url)
 
         for checksum_field in ["ETag", "Content-MD5", "Digest"]:
diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py
index 81e438a81..568284e22 100644
--- a/fsspec/implementations/tests/test_http.py
+++ b/fsspec/implementations/tests/test_http.py
@@ -237,9 +237,13 @@ def test_random_access(server, headers):
 @pytest.mark.parametrize(
     "headers",
     [
-        {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
+        # HTTPFile seeks, response headers lack size, assumed no range support
+        {"head_ok": "true", "head_give_length": "true"},
+        # HTTPFile seeks, response is not a range
         {"ignore_range": "true", "give_length": "true"},
         {"ignore_range": "true", "give_range": "true"},
+        # HTTPStreamFile does not seek (past 0)
+        {"accept_range": "none", "head_ok": "true", "give_length": "true"},
     ],
 )
 def test_no_range_support(server, headers):
@@ -247,8 +251,8 @@ def test_no_range_support(server, headers):
     url = server + "/index/realfile"
     with h.open(url, "rb") as f:
         # Random access is not possible if the server doesn't respect Range
-        f.seek(5)
         with pytest.raises(ValueError):
+            f.seek(5)
             f.read(10)
 
         # Reading from the beginning should still work
diff --git a/fsspec/tests/conftest.py b/fsspec/tests/conftest.py
index fb1efb041..413ed717b 100644
--- a/fsspec/tests/conftest.py
+++ b/fsspec/tests/conftest.py
@@ -135,10 +135,10 @@ def read_chunks(self):
             self.rfile.readline()
 
     def do_HEAD(self):
+        r_headers = {}
         if "head_not_auth" in self.headers:
-            return self._respond(
-                403, {"Content-Length": 123}, b"not authorized for HEAD request"
-            )
+            r_headers["Content-Length"] = 123
+            return self._respond(403, r_headers, b"not authorized for HEAD request")
         elif "head_ok" not in self.headers:
             return self._respond(405)
 
@@ -148,23 +148,23 @@ def do_HEAD(self):
             return self._respond(404)
 
         if ("give_length" in self.headers) or ("head_give_length" in self.headers):
-            response_headers = {"Content-Length": len(file_data)}
             if "zero_length" in self.headers:
-                response_headers["Content-Length"] = 0
+                r_headers["Content-Length"] = 0
             elif "gzip_encoding" in self.headers:
                 file_data = gzip.compress(file_data)
-                response_headers["Content-Encoding"] = "gzip"
-                response_headers["Content-Length"] = len(file_data)
-
-            self._respond(200, response_headers)
+                r_headers["Content-Encoding"] = "gzip"
+                r_headers["Content-Length"] = len(file_data)
+            else:
+                r_headers["Content-Length"] = len(file_data)
         elif "give_range" in self.headers:
-            self._respond(
-                200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
-            )
+            r_headers["Content-Range"] = f"0-{len(file_data) - 1}/{len(file_data)}"
         elif "give_etag" in self.headers:
-            self._respond(200, {"ETag": "xxx"})
-        else:
-            self._respond(200)  # OK response, but no useful info
+            r_headers["ETag"] = "xxx"
+
+        if self.headers.get("accept_range") == "none":
+            r_headers["Accept-Ranges"] = "none"
+
+        self._respond(200, r_headers)
 
 
 @contextlib.contextmanager