diff --git a/src/databricks/sql/auth/retry.py b/src/databricks/sql/auth/retry.py
index fd5184f2..b18edbbc 100644
--- a/src/databricks/sql/auth/retry.py
+++ b/src/databricks/sql/auth/retry.py
@@ -37,21 +37,50 @@ def get(cls, value: str):
 
 class DatabricksRetryPolicy(Retry):
     """
-
-    Implements the entirety of our retry policy across connectors.
-    Same as urllib3.Retry but implements retry_wait_min and retry_wait_max as part of its exponential backoff calculation
-
-    Retry Policy arguments are defined in the docstring for our ThriftBackend class
-
-    delay_default is the time in seconds the connector will wait between requests polling for a GetOperationStatus request.
-
-    Retry.allowed_methods is set to POST
-    Retry.status_forcelist is set to codes 429 and 503
-
-    kwargs is used by urllib3 to pass state between successive Retry iterations.
-
-    Some of these parameters are counters that change from one retry to the next
-    Others are like local constants that do not change between retries
+    Implements our v3 retry policy by extending urllib3's robust default retry behaviour.
+
+    See `should_retry()` for details about what we do and do not retry.
+
+    :param delay_min:
+        Float of seconds for the minimum delay between retries. Passed to urllib3 as its
+        backoff_factor.
+
+    :param delay_max:
+        Float of seconds for the maximum delay between retries. Passed to urllib3 as its
+        backoff_max.
+
+    :param stop_after_attempts_count:
+        Integer maximum number of attempts that will be retried. Passed to urllib3 as its
+        total.
+
+    :param stop_after_attempts_duration:
+        Float of maximum number of seconds from the beginning of the first request that a
+        request may be retried. This behaviour is not implemented in urllib3.
+
+    :param delay_default:
+        Float of seconds the connector will wait between successive GetOperationStatus
+        requests. This parameter is not used to retry failed network requests. We include
+        it in this class to keep all retry behaviour encapsulated in this file.
+
+    :param force_dangerous_codes:
+        List of integer HTTP status codes that the connector will retry, even for dangerous
+        commands like ExecuteStatement. This is passed to urllib3 by extending its status_forcelist.
+
+    :param _retry_start_time:
+        Float unix timestamp. Used to monitor the overall request duration across successive
+        retries. Never set this value directly. Use self.start_retry_timer() instead. Users
+        never set this value. It is set by ThriftBackend immediately before issuing a network
+        request.
+
+    :param _command_type:
+        CommandType of the current request being retried. Used to modify retry behaviour based
+        on the type of Thrift command being issued. See self.should_retry() for details. Users
+        never set this value directly. It is set by ThriftBackend immediately before issuing
+        a network request.
+
+    :param urllib3_kwargs:
+        Dictionary of arguments that are passed to Retry.__init__. Any setting of Retry() that
+        Databricks does not override or extend may be modified here.
     """
 
     def __init__(
@@ -130,7 +159,7 @@ def new(self, **urllib3_incremented_counters: typing.Any) -> Retry:
         The arguments it passes to `.new()` (total, connect, read, etc.) are those
         which modified by `.increment()`. Since our subclass has a new __init__
         signature and requires custom state variables, we override the method
-        to pipe our Databricks-specific state while preserving the super-class's behaviour behaviour.
+        to pipe our Databricks-specific state while preserving the super-class's behaviour.
         """
 
         # These arguments will match the function signature for self.__init__
@@ -196,7 +225,9 @@ def get_operation_status_delay(self) -> float:
         return self.delay_default
 
     def start_retry_timer(self):
-        """Timer is used to monitor the overall time across successive requests"""
+        """Timer is used to monitor the overall time across successive requests
+
+        Should only be called by ThriftBackend before sending a Thrift command"""
         self._retry_start_time = time.time()
 
     def check_timer_duration(self):
@@ -212,17 +243,13 @@ def check_proposed_wait(self, proposed_wait: Union[int, float]) -> None:
                 f"Retry request would exceed Retry policy max retry duration of {self.stop_after_attempts_duration} seconds"
             )
 
-    def get_backoff_time(self) -> float:
-        """Calls urllib3's built-in get_backoff_time but raises an exception if stop_after_attempts_duration would be exceeded.
-        If proposed backoff exceeds the configured delay_max, the delay_max is used instead"""
-
-        proposed_backoff = super().get_backoff_time()
-        proposed_backoff = min(proposed_backoff, self.delay_max)
-        self.check_proposed_wait(proposed_backoff)
+    def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:  # type: ignore
+        """Sleeps for the duration specified in the response Retry-After header, if present
 
-        return proposed_backoff
+        A MaxRetryDurationError will be raised if doing so would exceed self.stop_after_attempts_duration
 
-    def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:  # type: ignore
+        This method is only called by urllib3 internals.
+        """
         retry_after = self.get_retry_after(response)
         if retry_after:
             self.check_proposed_wait(retry_after)
@@ -231,18 +258,47 @@ def sleep_for_retry(self, response: BaseHTTPResponse) -> bool:  # type: ignore
 
         return False
 
-    def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]:
-        """We retry by default with the following exceptions:
+    def get_backoff_time(self) -> float:
+        """Calls urllib3's built-in get_backoff_time.
 
-        Any 200 status code -> Because the request succeeded
-        Any 501 status code -> Because it's not recoverable ever. Raise a NonRecoverableNetworkError.
-        Any 404 if the command was CancelOperation or CloseSession and this is not the first request
-        Any ExecuteStatement command unless it has code 429 or 504
+        Never returns a value larger than self.delay_max
+        A MaxRetryDurationError will be raised if the calculated backoff would exceed self.stop_after_attempts_duration
+
+        Note: within urllib3, a backoff is only calculated in cases where a Retry-After header is not present
+        in the previous unsuccessful request.
 
         """
-        # TODO: Port in the NoRetryReason enum for logging?
-        # TODO: something needs to go here about max wall-clock duration.
-        # It doesn't belong in the backoff calculation method
+        proposed_backoff = super().get_backoff_time()
+        proposed_backoff = min(proposed_backoff, self.delay_max)
+        self.check_proposed_wait(proposed_backoff)
+
+        return proposed_backoff
+
+    def should_retry(self, method: str, status_code: int) -> Tuple[bool, str]:
+        """This method encapsulates the connector's approach to retries.
+
+        We always retry a request unless one of these conditions is met:
+
+        1. The request received a 200 (Success) status code
+           Because the request succeeded. No retry is required.
+        2. The request received a 501 (Not Implemented) status code
+           Because this request can never succeed.
+        3. The request received a 404 (Not Found) code and the request CommandType
+           was CloseSession or CloseOperation. This code indicates that the session
+           or cursor was already closed. Further retries will always return the same
+           code.
+        4. The request CommandType was ExecuteStatement and the HTTP code does not
+           appear in the default status_forcelist or force_dangerous_codes list. By
+           default, this means ExecuteStatement is only retried for codes 429 and 503.
+           This limit prevents automatically retrying non-idempotent commands that could
+           be destructive.
+
+        5. OSErrors (handled automatically by urllib3 outside this method)
+        6. Redirects (handled automatically by urllib3 outside this method)
+
+        Returns True if the request should be retried. Returns False or raises an exception
+        if a retry would violate the configured policy.
+        """
 
         # Request succeeded. Don't retry.
         if status_code == 200:
@@ -309,7 +365,9 @@ def is_retry(
         self, method: str, status_code: int, has_retry_after: bool = False
     ) -> bool:
         """
-        Overriden to support varied behaviour based on the current CommandType
+        Called by urllib3 when determining whether or not to retry
+
+        Logs a debug message if the request will be retried
         """
 
         should_retry, msg = self.should_retry(method, status_code)
diff --git a/src/databricks/sql/thrift_backend.py b/src/databricks/sql/thrift_backend.py
index 275204a6..79552f6c 100644
--- a/src/databricks/sql/thrift_backend.py
+++ b/src/databricks/sql/thrift_backend.py
@@ -220,6 +220,7 @@ def __init__(
 
         self._request_lock = threading.RLock()
 
+    # TODO: Move this bounding logic into DatabricksRetryPolicy for v3 (PECO-918)
     def _initialize_retry_args(self, kwargs):
         # Configure retries & timing: use user-settings or defaults, and bound
        # by policy. Log.warn when given param gets restricted.
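
Note: the interaction between delay_max and stop_after_attempts_duration in the new get_backoff_time() can be illustrated with a standalone sketch. This is not the connector's implementation; the constant values, the bounded_backoff helper, and the local MaxRetryDurationError class are assumptions made only for this example.

    import time

    # Stand-in values for the policy's configured limits (assumptions for this sketch).
    DELAY_MIN = 1.0                       # seconds, maps to urllib3's backoff_factor
    DELAY_MAX = 60.0                      # seconds, maps to urllib3's backoff_max
    STOP_AFTER_ATTEMPTS_DURATION = 900.0  # total wall-clock retry budget in seconds

    class MaxRetryDurationError(Exception):
        """Sleeping again would exceed the overall retry duration budget."""

    def bounded_backoff(consecutive_errors: int, retry_start_time: float) -> float:
        """Return the next sleep: exponential, capped at DELAY_MAX, and checked
        against the wall-clock budget measured from retry_start_time."""
        # urllib3-style exponential backoff: factor * 2 ** (errors - 1)
        proposed = DELAY_MIN * (2 ** (consecutive_errors - 1))
        proposed = min(proposed, DELAY_MAX)

        elapsed = time.time() - retry_start_time
        if elapsed + proposed > STOP_AFTER_ATTEMPTS_DURATION:
            raise MaxRetryDurationError(
                f"Retry would exceed the max retry duration of {STOP_AFTER_ATTEMPTS_DURATION}s"
            )
        return proposed

The key point the policy documents: the exponential backoff is clamped to delay_max before it is checked against the overall duration budget, so a single long sleep can never silently blow past stop_after_attempts_duration.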
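Note: the numbered rules in the new should_retry() docstring can also be read as a small decision function. The sketch below only restates rules 1 through 4 for illustration; the CommandType member names and the should_retry_sketch helper are assumptions, not the connector's API, and OSErrors/redirects are handled by urllib3 outside this logic.

    from enum import Enum
    from typing import List, Optional, Tuple

    class CommandType(Enum):
        # Assumed member names; the connector defines its own CommandType enum.
        EXECUTE_STATEMENT = "ExecuteStatement"
        CLOSE_SESSION = "CloseSession"
        CLOSE_OPERATION = "CloseOperation"
        OTHER = "Other"

    DEFAULT_STATUS_FORCELIST = {429, 503}

    def should_retry_sketch(
        command: CommandType,
        status_code: int,
        force_dangerous_codes: Optional[List[int]] = None,
    ) -> Tuple[bool, str]:
        """Mirror the documented rules: retry everything except the listed cases."""
        if status_code == 200:
            return False, "Request succeeded. No retry required."
        if status_code == 501:
            return False, "Received 501 Not Implemented: this request can never succeed."
        if status_code == 404 and command in (CommandType.CLOSE_SESSION, CommandType.CLOSE_OPERATION):
            return False, "Session or cursor is already closed."
        allowed = DEFAULT_STATUS_FORCELIST | set(force_dangerous_codes or [])
        if command is CommandType.EXECUTE_STATEMENT and status_code not in allowed:
            return False, "ExecuteStatement is only retried for explicitly allowed codes."
        return True, "Retry permitted by policy."

    # Example: a 502 on ExecuteStatement is not retried unless 502 is force-listed.
    print(should_retry_sketch(CommandType.EXECUTE_STATEMENT, 502))         # (False, ...)
    print(should_retry_sketch(CommandType.EXECUTE_STATEMENT, 502, [502]))  # (True, ...)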