Skip to content

Commit

Permalink
Fix errandboy exiting issue (#11)
Browse files Browse the repository at this point in the history
Register the close() method for calling at exit.
  • Loading branch information
aidangomez authored Nov 9, 2018
1 parent c1bceec commit 087a66c
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 17 deletions.
1 change: 1 addition & 0 deletions cloud/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from cloud.envs.gcp import TPU
from cloud.envs.gcp import TPUManager

from cloud.cloud import close
from cloud.cloud import connect
from cloud.cloud import down
from cloud.cloud import delete
12 changes: 12 additions & 0 deletions cloud/cloud.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import atexit
import libcloud
import logging
import toml

import cloud
from cloud import registry as reg
from cloud import Instance
from cloud.envs import utils

logger = logging.getLogger(__name__)


def connect():
with open(utils.config_path(), "r") as cf:
Expand All @@ -14,9 +18,17 @@ def connect():
cloud.instance = reg.retrieve(provider, config=config)


def close():
utils.kill_transport()
utils.kill_server()


def down():
cloud.instance.down()


def delete(confirm=True):
cloud.instance.delete(confirm)


atexit.register(close)
19 changes: 10 additions & 9 deletions cloud/envs/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def usable(self):
def up(self, async=False):
raise NotImplementedError

def down(self, async=False):
def down(self, async=True):
raise NotImplementedError

def delete(self, async=False):
def delete(self, async=True):
if self.manager:
self.manager.remove(self)

Expand All @@ -42,9 +42,7 @@ def __init__(self, manager=None, **kwargs):
assert utils.get_server().is_alive()

def _kill_command_server(self):
logger.warn("Killing transport")
utils.kill_transport()
logger.warn("Killing server")
utils.kill_server()

@property
Expand Down Expand Up @@ -72,17 +70,20 @@ def node(self):

return self._node

def down(self, async=False, delete_resources=True):
def clean(self, async=True, delete_resources=True):
for rm in self.resource_managers:
if delete_resources:
rm.delete(async=async)
else:
rm.down(async=async)

self._kill_command_server()

def down(self, async=True, delete_resources=True):
self.clean(async=async, delete_resources=delete_resources)
self.driver.ex_stop_node(self.node)

def delete(self, async=False, confirm=True):
def delete(self, async=True, confirm=True):
while confirm:
r = input("Are you sure you wish to delete this instance (y/[n]): ")

Expand All @@ -94,7 +95,7 @@ def delete(self, async=False, confirm=True):

super().delete(async=async)

self._kill_command_server()
self.clean(async=async, delete_resources=True)
self.driver.destroy_node(self.node, destroy_boot_disk=True)


Expand Down Expand Up @@ -130,15 +131,15 @@ def remove(self, *args, **kwargs):
def up(self, async=False):
raise NotImplementedError

def down(self, async=False):
def down(self, async=True):
for r in self.resources:
try:
r.down(async=async)
except Exception as e:
logger.error("Failed to shutdown resource: %s" % r)
logger.error(traceback.format_exc())

def delete(self, async=False):
def delete(self, async=True):
for r in self.resources:
try:
r.delete(async=async)
Expand Down
6 changes: 3 additions & 3 deletions cloud/envs/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,15 @@ def up(self, async=False):

utils.try_call(cmd)

def down(self, async=False):
def down(self, async=True):
cmd = ["gcloud", "alpha", "compute", "tpus", "stop", self.name]
if async:
cmd += ["--async"]

utils.try_call(cmd)

def delete(self, async=False):
super().delete()
def delete(self, async=True):
super().delete(async=async)

cmd = ["gcloud", "alpha", "compute", "tpus", "delete", self.name]
if async:
Expand Down
9 changes: 5 additions & 4 deletions cloud/envs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from errand_boy.transports.unixsocket import UNIXSocketTransport
from errand_boy.run import main as eb_main

logger = logging.getLogger(__name__)

EB_TRANSPORT = None
EB_SERVER = None

Expand All @@ -23,6 +25,7 @@ def kill_transport():
if EB_TRANSPORT is None:
return

logger.warn("Killing transport")
del EB_TRANSPORT
EB_TRANSPORT = None

Expand All @@ -47,17 +50,15 @@ def kill_server():
if EB_SERVER is None:
return

logger.warn("Killing server")
if EB_SERVER.is_alive():
EB_SERVER.terminate()
time.sleep(0.5)
EB_SERVER.join(timeout=5)
EB_SERVER.join(timeout=1)
del EB_SERVER
EB_SERVER = None


logger = logging.getLogger(__name__)


def call(cmd):
if isinstance(cmd, list):
cmd = " ".join(cmd)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setup(
name='dl-cloud',
version='0.0.3',
version='0.0.4',
description='Cloud resource management for deep learning applications.',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 087a66c

Please sign in to comment.