diff --git a/dpdispatcher/dp_cloud_server_context.py b/dpdispatcher/dp_cloud_server_context.py index db23f44d..f10bc2e0 100644 --- a/dpdispatcher/dp_cloud_server_context.py +++ b/dpdispatcher/dp_cloud_server_context.py @@ -307,7 +307,7 @@ def machine_subfields(cls) -> List[Argument]: "retry_count", [int, type(None)], optional=True, - default=3, + default=2, doc=doc_retry_count, ), Argument( diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 9b8301aa..c1f9ac6b 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -841,7 +841,7 @@ def handle_unexpected_job_state(self): retry_count = 3 assert self.machine is not None if hasattr(self.machine, "retry_count") and self.machine.retry_count > 0: - retry_count = self.machine.retry_count + retry_count = self.machine.retry_count + 1 if (self.fail_count) > 0 and (self.fail_count % retry_count == 0): raise RuntimeError( f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}"