Skip to content

Commit

Permalink
Merge pull request #117 from dinal/horovoc_cpu
Browse files Browse the repository at this point in the history
add horovod cpu and change dir location
  • Loading branch information
adiso75 authored Sep 15, 2019
2 parents ffdbc3a + 87b6961 commit dae664e
Show file tree
Hide file tree
Showing 10 changed files with 1,246 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade git+https://github.com/fchollet/keras\n",
"!pip install --upgrade tensorflow \n",
"!pip install --upgrade numpy"
"!pip install --upgrade keras==2.2.4\n",
"!pip install --upgrade tensorflow==1.13.1 \n",
"!pip install --upgrade 'numpy<1.15.0'"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!pip install git+https://github.com/v3io/v3io-gputils"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"!rm -f /User/demos/horovod/cpu/image-classification/cats_dogs.hd5\n",
"!mkdir /User/demos/horovod/cpu/image-classification/checkpoints"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"HOROVOD_JOB_NAME = \"horovod-cats-n-dogs\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'apiVersion': 'kubeflow.org/v1alpha1',\n",
" 'kind': 'MPIJob',\n",
" 'metadata': {'creationTimestamp': '2019-07-02T07:41:59Z',\n",
" 'generation': 1,\n",
" 'name': 'horovod-cats-n-dogs',\n",
" 'namespace': 'default-tenant',\n",
" 'resourceVersion': '1391131',\n",
" 'selfLink': '/apis/kubeflow.org/v1alpha1/namespaces/default-tenant/mpijobs/horovod-cats-n-dogs',\n",
" 'uid': 'df9b08a1-9c9c-11e9-98d3-d8c4972b0204'},\n",
" 'spec': {'replicas': 8,\n",
" 'template': {'spec': {'containers': [{'command': ['mpirun',\n",
" 'python',\n",
" '/User/demos/horovod/cpu/image-classification/horovod_train_cats_n_dogs.py',\n",
" '/User/demos/horovod/cpu/image-classification/cats_and_dogs_filtered',\n",
" '/User/demos/horovod/cpu/image-classification'],\n",
" 'image': 'iguaziodocker/horovod-cpu:0.0.1',\n",
" 'name': 'horovod-cats-n-dogs',\n",
" 'resources': {'limits': {'nvidia.com/gpu': 0}},\n",
" 'securityContext': {'capabilities': {'add': ['IPC_LOCK']}},\n",
" 'volumeMounts': [{'mountPath': '/User',\n",
" 'name': 'v3io'}]}],\n",
" 'volumes': [{'flexVolume': {'driver': 'v3io/fuse',\n",
" 'options': {'accessKey': '1e52ff93-a541-4880-abf1-d9b948af77de',\n",
" 'container': 'users',\n",
" 'subPath': '/iguazio'}},\n",
" 'name': 'v3io'}]}}}}\n"
]
}
],
"source": [
"from v3io_gputils.mpijob import MpiJob\n",
"\n",
"job = MpiJob(HOROVOD_JOB_NAME, 'iguaziodocker/horovod-cpu:0.0.1', ['/User/demos/horovod/cpu/image-classification/horovod_train_cats_n_dogs.py',\n",
" '/User/demos/horovod/cpu/image-classification/cats_and_dogs_filtered',\n",
" '/User/demos/horovod/cpu/image-classification'])\n",
"\n",
"job.replicas(2).gpus(0)\n",
"job.submit()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"horovod-cats-n-dogs-launcher-8kg8c 1/1 Running 0 75s\n",
"horovod-cats-n-dogs-worker-0 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-1 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-2 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-3 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-4 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-5 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-6 1/1 Running 0 83s\n",
"horovod-cats-n-dogs-worker-7 1/1 Running 0 83s\n"
]
}
],
"source": [
"\n",
"!kubectl get pods | grep $HOROVOD_JOB_NAME"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"apiVersion: kubeflow.org/v1alpha1\n",
"kind: MPIJob\n",
"metadata:\n",
" creationTimestamp: 2019-07-02T06:49:07Z\n",
" generation: 1\n",
" name: horovod-cats-n-dogs\n",
" namespace: default-tenant\n",
" resourceVersion: \"1386982\"\n",
" selfLink: /apis/kubeflow.org/v1alpha1/namespaces/default-tenant/mpijobs/horovod-cats-n-dogs\n",
" uid: 7d0cd80c-9c95-11e9-98d3-d8c4972b0204\n",
"spec:\n",
" backoffLimit: 6\n",
" replicas: 8\n",
" template:\n",
" metadata:\n",
" creationTimestamp: null\n",
" spec:\n",
" containers:\n",
" - command:\n",
" - mpirun\n",
" - python\n",
" - /User/demos/horovod/cpu/image-classification/horovod_train_cats_n_dogs.py\n",
" - /User/demos/horovod/cpu/image-classification/cats_and_dogs_filtered\n",
" - /User/demos/horovod/cpu/image-classification\n",
" image: iguaziodocker/cpu/horovod-cpu:0.1.1\n",
" name: horovod-cats-n-dogs\n",
" resources:\n",
" limits:\n",
" nvidia.com/gpu: \"1\"\n",
" securityContext:\n",
" capabilities:\n",
" add:\n",
" - IPC_LOCK\n",
" volumeMounts:\n",
" - mountPath: /User\n",
" name: v3io\n",
" volumes:\n",
" - flexVolume:\n",
" driver: v3io/fuse\n",
" options:\n",
" accessKey: 1e52ff93-a541-4880-abf1-d9b948af77de\n",
" container: users\n",
" subPath: /iguazio\n",
" name: v3io\n",
"status:\n",
" completionTime: 2019-07-02T06:56:20Z\n",
" launcherStatus: Succeeded\n",
" startTime: 2019-07-02T06:49:14Z\n"
]
}
],
"source": [
"!kubectl get mpijob $HOROVOD_JOB_NAME -o yaml"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'apiVersion': 'v1',\n",
" 'details': {'group': 'kubeflow.org',\n",
" 'kind': 'mpijobs',\n",
" 'name': 'horovod-cats-n-dogs',\n",
" 'uid': '1b58dd58-9c97-11e9-98d3-d8c4972b0204'},\n",
" 'kind': 'Status',\n",
" 'metadata': {},\n",
" 'status': 'Success'}\n"
]
}
],
"source": [
"job.delete()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
],
"source": [
"%nuclio env -c MODEL_PATH=/model/\n",
"%nuclio env -l MODEL_PATH=/User/demos/gpu/horovod/image-classification/cats_dogs.hd5\n",
"%nuclio env -l MODEL_PATH=/User/demos/horovod/cpu/image-classification/cats_dogs.hd5\n",
"%nuclio env -l PREDICTION_MAP_PATH=./model/prediction_classes_map.json"
]
},
Expand All @@ -87,9 +87,9 @@
"outputs": [],
"source": [
"%%nuclio cmd -c\n",
"pip install git+https://github.com/fchollet/keras\n",
"pip install tensorflow \n",
"pip install numpy\n",
"pip install keras==2.2.4\n",
"pip install tensorflow==1.13.1 \n",
"pip install 'numpy<1.15.0'\n",
"pip install requests\n",
"pip install pillow"
]
Expand Down Expand Up @@ -121,12 +121,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"mounting volume path /model as ~/demos/gpu/horovod/image-classification/cats_dogs/model\n"
"mounting volume path /model as ~/demos/horovod/cpu/image-classification/cats_dogs/model\n"
]
}
],
"source": [
"%nuclio mount /model ~/demos/gpu/horovod/image-classification/cats_dogs/model"
"%nuclio mount /model ~/demos/horovod/cpu/image-classification/cats_dogs/model"
]
},
{
Expand Down Expand Up @@ -377,7 +377,7 @@
" options:\n",
" accessKey: 1e52ff93-a541-4880-abf1-d9b948af77de\n",
" container: users\n",
" subPath: /iguazio/demos/gpu/horovod/image-classification/cats_dogs/model\n",
" subPath: /iguazio/demos/horovod/cpu/image-classification/cats_dogs/model\n",
" name: fs\n",
" volumeMount:\n",
" mountPath: /model\n",
Expand Down
Loading

0 comments on commit dae664e

Please sign in to comment.