From c981c54cb1d3001a83174c3a1b8688d9943f3f59 Mon Sep 17 00:00:00 2001
From: Patrick Avery <patrick.avery@kitware.com>
Date: Mon, 29 Jan 2024 12:40:44 -0600
Subject: [PATCH] Infer DICOM file size, when possible

Google Healthcare API supports single-part requests for DICOM files.
Single-part requests are not officially supported by the DICOMweb standard,
but they are easier to work with. Google Healthcare API also provides
the Content-Length. We can utilize this information to infer the DICOM file
size without having to stream the whole file.

This PR adds logic to automatically infer the DICOM file size and set
it, when possible, without streaming the whole file.

Unfortunately, dcm4chee does not support single-part requests, and it does
not return a Content-Length either. So we must still stream the whole file
to get the file size for dcm4chee servers. But if we determine a new way
to infer the file size, we can add it as well.

Signed-off-by: Patrick Avery <patrick.avery@kitware.com>
---
 .../assetstore/dicomweb_assetstore_adapter.py | 165 ++++++++++++++----
 1 file changed, 131 insertions(+), 34 deletions(-)

diff --git a/sources/dicom/large_image_source_dicom/assetstore/dicomweb_assetstore_adapter.py b/sources/dicom/large_image_source_dicom/assetstore/dicomweb_assetstore_adapter.py
index 368270f37..123cc4cbf 100644
--- a/sources/dicom/large_image_source_dicom/assetstore/dicomweb_assetstore_adapter.py
+++ b/sources/dicom/large_image_source_dicom/assetstore/dicomweb_assetstore_adapter.py
@@ -139,41 +139,14 @@ def setContentHeaders(self, file, offset, endByte, contentDisposition=None):
     def downloadFile(self, file, offset=0, headers=True, endByte=None,
                      contentDisposition=None, extraParameters=None, **kwargs):
 
-        from dicomweb_client.web import _Transaction
-
-        dicom_uids = file['dicom_uids']
-        study_uid = dicom_uids['study_uid']
-        series_uid = dicom_uids['series_uid']
-        instance_uid = dicom_uids['instance_uid']
-
-        client = _create_dicomweb_client(self.assetstore_meta)
-
         if headers:
             setResponseHeader('Accept-Ranges', 'bytes')
             self.setContentHeaders(file, offset, endByte, contentDisposition)
 
-        # Create the URL
-        url = client._get_instances_url(
-            _Transaction.RETRIEVE,
-            study_uid,
-            series_uid,
-            instance_uid,
-        )
-
-        # Build the headers
-        transfer_syntax = '*'
-        accept_parts = [
-            'multipart/related',
-            'type="application/dicom"',
-            f'transfer-syntax={transfer_syntax}',
-        ]
-        request_headers = {
-            'Accept': '; '.join(accept_parts),
-        }
-
         def stream():
             # Perform the request
-            response = client._http_get(url, headers=request_headers, stream=True)
+            # Try a single-part download first. If that doesn't work, do multipart.
+            response = self._request_retrieve_instance_prefer_singlepart(file)
 
             bytes_read = 0
             for chunk in self._stream_retrieve_instance_response(response):
@@ -203,6 +176,76 @@ def stream():
 
         return stream
 
+    def _request_retrieve_instance_prefer_singlepart(self, file, transfer_syntax='*'):
+        # Try to perform a singlepart request. If it fails, perform a multipart request
+        # instead.
+        response = None
+        try:
+            response = self._request_retrieve_instance(file, multipart=False,
+                                                       transfer_syntax=transfer_syntax)
+        except requests.HTTPError:
+            # If there is an HTTPError, the server might not accept single-part requests...
+            pass
+
+        if self._is_singlepart_response(response):
+            return response
+
+        # Perform the multipart request instead
+        return self._request_retrieve_instance(file, transfer_syntax=transfer_syntax)
+
+    def _request_retrieve_instance(self, file, multipart=True, transfer_syntax='*'):
+        # Multipart requests are officially supported by the DICOMweb standard.
+        # Singlepart requests are not officially supported, but they are easier
+        # to work with.
+        # Google Healthcare API supports it.
+        # See here: https://cloud.google.com/healthcare-api/docs/dicom#dicom_instances
+
+        # Create the URL
+        client = _create_dicomweb_client(self.assetstore_meta)
+        url = self._create_retrieve_instance_url(client, file)
+
+        # Build the headers
+        headers = {}
+        if multipart:
+            # This is officially supported by the DICOMweb standard.
+            headers['Accept'] = '; '.join((
+                'multipart/related',
+                'type="application/dicom"',
+                f'transfer-syntax={transfer_syntax}',
+            ))
+        else:
+            # This is not officially supported by the DICOMweb standard,
+            # but it is easier to work with, and some servers such as
+            # Google Healthcare API support it.
+            # See here: https://cloud.google.com/healthcare-api/docs/dicom#dicom_instances
+            headers['Accept'] = f'application/dicom; transfer-syntax={transfer_syntax}'
+
+        return client._http_get(url, headers=headers, stream=True)
+
+    def _create_retrieve_instance_url(self, client, file):
+        from dicomweb_client.web import _Transaction
+
+        dicom_uids = file['dicom_uids']
+        study_uid = dicom_uids['study_uid']
+        series_uid = dicom_uids['series_uid']
+        instance_uid = dicom_uids['instance_uid']
+
+        return client._get_instances_url(
+            _Transaction.RETRIEVE,
+            study_uid,
+            series_uid,
+            instance_uid,
+        )
+
+    def _stream_retrieve_instance_response(self, response):
+        # Check if the original request asked for multipart data
+        if 'multipart/related' in response.request.headers.get('Accept', ''):
+            yield from self._stream_dicom_multipart_response(response)
+        else:
+            # The content should *only* contain the DICOM file
+            with response:
+                yield from response.iter_content(BUF_SIZE)
+
     def _extract_media_type_and_boundary(self, response):
         content_type = response.headers['content-type']
         media_type, *ct_info = (ct.strip() for ct in content_type.split(';'))
@@ -215,7 +258,7 @@ def _extract_media_type_and_boundary(self, response):
 
         return media_type, boundary
 
-    def _stream_retrieve_instance_response(self, response):
+    def _stream_dicom_multipart_response(self, response):
         # The first part of this function was largely copied from dicomweb-client's
         # _decode_multipart_message() function. But we can't use that function here
         # because it relies on reading the whole DICOM file into memory. We want to
@@ -307,6 +350,50 @@ def _stream_retrieve_instance_response(self, response):
             msg = 'Failed to find ending boundary in response content'
             raise ValueError(msg)
 
+    def _infer_file_size(self, file):
+        # Try various methods to infer the file size, without streaming the
+        # whole file. Returns the file size if successful, or `None` if unsuccessful.
+        if file.get('size') is not None:
+            # The file size was already determined.
+            return file['size']
+
+        # Currently, the only method is inferring from the single-part Content-Length
+        return self._infer_file_size_singlepart_content_length(file)
+
+    def _is_singlepart_response(self, response):
+        if response is None:
+            return False
+
+        content_type = response.headers.get('Content-Type')
+        return (
+            response.status_code == 200 and
+            not any(x in content_type for x in ('multipart/related', 'boundary'))
+        )
+
+    def _infer_file_size_singlepart_content_length(self, file):
+        # First, try to see if single-part requests work, and if the Content-Length
+        # is returned. This works for Google Healthcare API.
+        try:
+            response = self._request_retrieve_instance(file, multipart=False)
+        except requests.HTTPError:
+            # If there is an HTTPError, the server might not accept single-part requests...
+            return
+
+        if not self._is_singlepart_response(response):
+            # Does not support single-part requests...
+            return
+
+        content_length = response.headers.get('Content-Length')
+        if not content_length:
+            # The server did not return a Content-Length
+            return
+
+        try:
+            # The DICOM file size is equal to the Content-Length
+            return int(content_length)
+        except ValueError:
+            return
+
     def importData(self, parent, parentType, params, progress, user, **kwargs):
         """
         Import DICOMweb WSI instances from a DICOMweb server.
@@ -408,7 +495,10 @@ def importData(self, parent, parentType, params, progress, user, **kwargs):
                     'instance_uid': instance_uid,
                 }
                 file['imported'] = True
-                File().save(file)
+
+                # Try to infer the file size without streaming, if possible.
+                file['size'] = self._infer_file_size(file)
+                file = File().save(file)
 
             items.append(item)
 
@@ -420,16 +510,23 @@ def auth_session(self):
 
     def getFileSize(self, file):
         # This function will compute the size of the DICOM file (a potentially
-        # expensive operation, since it may have to stream the whole file),
-        # and cache the result in file['size'].
+        # expensive operation, since it may have to stream the whole file).
+        # The caller is expected to cache the result in file['size'].
         # This function is called when the size is needed, such as the girder
         # fuse mount code, and range requests.
         if file.get('size') is not None:
             # It has already been computed once. Return the cached size.
             return file['size']
 
+        # Try to infer the file size without streaming, if possible.
+        size = self._infer_file_size(file)
+        if size:
+            return size
+
+        # We must stream the whole file to get the file size...
         size = 0
-        for chunk in self.downloadFile(file, headers=False)():
+        response = self._request_retrieve_instance_prefer_singlepart(file)
+        for chunk in self._stream_retrieve_instance_response(response):
             size += len(chunk)
 
         # This should get cached in file['size'] in File().updateSize().