Skip to content

Commit

Permalink
Merge pull request #2232 from FedML-AI/raphael/quick-fix-error-catch
Browse files Browse the repository at this point in the history
[Deploy] Edge Case Handling.
  • Loading branch information
fedml-alex authored Nov 11, 2024
2 parents 9fc5b4d + a108a8a commit 98e084a
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 223 deletions.
5 changes: 3 additions & 2 deletions python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,15 +511,16 @@ def _get_mqtt_service():


def set_local_on_premise_platform_host(local_on_premise_platform_host):
    """Record the host of the local on-premise platform.

    The value is written to the current process environment under
    ``FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST`` and then handed to
    ``set_env_kv`` under the same key.

    Args:
        local_on_premise_platform_host: Host name or address, as a string
            (``os.environ`` only accepts string values).
    """
    os.environ['FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST'] = local_on_premise_platform_host
    # The .env file should be updated as well; this is delegated to set_env_kv
    # (defined elsewhere -- presumably it persists the pair; verify there).
    set_env_kv("FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST", local_on_premise_platform_host)


def get_local_on_premise_platform_host():
    """Return the configured local on-premise platform host.

    Returns:
        str: The value of the ``FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST``
        environment variable, or ``"127.0.0.1"`` when it is not set.
    """
    return os.environ.get('FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST', "127.0.0.1")


def set_local_on_premise_platform_port(local_on_premise_platform_port):
    """Record the port of the local on-premise platform.

    The port is converted to a string, written to the current process
    environment under ``FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT`` and then
    handed to ``set_env_kv`` under the same key.

    Args:
        local_on_premise_platform_port: Port number; any value accepted by
            ``str()`` (typically an int or a numeric string).
    """
    # Convert once so both destinations receive the identical string.
    port_text = str(local_on_premise_platform_port)
    os.environ['FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT'] = port_text
    set_env_kv("FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT", port_text)


def get_local_on_premise_platform_port():
Expand Down
53 changes: 33 additions & 20 deletions python/fedml/computing/scheduler/comm_utils/run_process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,26 +166,39 @@ def generate_yaml_doc(run_config_object, yaml_file):
@staticmethod
def get_pid_from_cmd_line(cmd_line, break_on_first=True):
    """Find live processes whose command line mentions ``cmd_line``.

    Every process reported by ``psutil.process_iter()`` is inspected once.
    A process matches when any single argument of its command line contains
    ``cmd_line`` as a substring and its status is running, sleeping or idle.
    The scan is best effort: a process that cannot be inspected is skipped,
    and a failure of the scan itself is printed rather than raised.

    Args:
        cmd_line: Substring to look for in each command-line argument.
        break_on_first: When true, return as soon as the first matching
            live process is found.

    Returns:
        list: PIDs of the matching processes, each listed at most once.
        Empty when nothing matches.
    """
    alive_statuses = (
        psutil.STATUS_RUNNING,
        psutil.STATUS_SLEEPING,
        psutil.STATUS_IDLE,
    )
    ret_pids = list()
    try:
        for proc in psutil.process_iter():
            try:
                try:
                    # Probe a few attributes first so processes that are
                    # zombies or have already exited are dropped early.
                    proc.as_dict(attrs=['cpu_times', 'name', 'pid', 'status'])
                except (psutil.ZombieProcess, psutil.NoSuchProcess):
                    continue

                # Test the arguments as a group so a process with several
                # matching arguments is reported once, not once per argument.
                if not any(arg.find(cmd_line) != -1 for arg in proc.cmdline()):
                    continue

                try:
                    # Read the status a single time; it may change between calls.
                    is_running = proc.status() in alive_statuses
                except Exception as e:
                    print(f"Error in get_pid_from_cmd_line inner loop: {e}")
                    is_running = False

                if is_running:
                    ret_pids.append(proc.pid)
                    if break_on_first:
                        return ret_pids
            except Exception:
                # Deliberately broad: one uninspectable process (access
                # denied, vanished mid-scan, ...) must not abort the scan.
                continue
    except Exception as e:
        print(f"Error in get_pid_from_cmd_line outer loop: {e}")

    return ret_pids

Expand Down
Loading

0 comments on commit 98e084a

Please sign in to comment.