Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add configuration file for Cedar (#168) #173

Open
wants to merge 36 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
bd86e21
Added test for slurm integration
aalitaiga Jul 31, 2017
1168e3e
New test for priority
aalitaiga Aug 8, 2017
243b191
Added gres + memory tests
aalitaiga Aug 8, 2017
0a16243
Refactored tests
aalitaiga Aug 9, 2017
6f31489
small update
aalitaiga Aug 10, 2017
3b0dd0b
Python3 compatibility + PR comments
aalitaiga Sep 1, 2017
e8e6ec0
Fixed naccelerators issue
aalitaiga Sep 1, 2017
04f0ff1
Updated tests to skip on Graham and Cedar smartdispatch modified to h…
aalitaiga Sep 19, 2017
d7d0300
Cleaned code with PR feedback
aalitaiga Sep 26, 2017
c7bb250
Updated tests
aalitaiga Oct 6, 2017
a20405b
Updated tests using mock
aalitaiga Oct 10, 2017
5bde1c6
Refactor detect_cluster tests
bouthilx Oct 10, 2017
0ff4776
Small changes in TestSlurmQueue
aalitaiga Oct 10, 2017
d1ad338
Fix add_sbatch_option bug
bouthilx Oct 13, 2017
899167f
Refactor SlurmJobGenerator
bouthilx Oct 13, 2017
581f835
Remove queue name for Slurm clusters
bouthilx Oct 13, 2017
1d74e1a
Replace PBS_JOBID with SLURM_JOB_ID
bouthilx Oct 13, 2017
f00e877
Add PBS_FILENAME definition to pbs.prolog
bouthilx Oct 14, 2017
6b2d530
Fix env var export option for Slurm
bouthilx Oct 14, 2017
21df3dd
Adapt PBS_WALLTIME for slurm
bouthilx Oct 14, 2017
ea1d5b3
Add sbatch to command-line launcher options
bouthilx Oct 14, 2017
adb8cba
Make get_launcher more flexible
bouthilx Oct 14, 2017
f3661ba
Add verbosity to smart-dispatch
bouthilx Oct 14, 2017
972a1ab
Updated documentation for slurm clusters
aalitaiga Oct 15, 2017
29973b0
Add support for SlurmJobGenerator
bouthilx Oct 16, 2017
f734fb3
Print stderr when both qsub and sacctmgr fails
bouthilx Oct 16, 2017
4506887
Add automatic script for cluster verification
bouthilx Oct 16, 2017
02845e0
Add verification script for cedar
bouthilx Oct 16, 2017
2d6e6fd
Add verification script for graham
bouthilx Oct 16, 2017
f967180
Add verification script for mila
bouthilx Oct 16, 2017
8c655b4
Make get_launcher return None when no launcher
bouthilx Oct 16, 2017
998f3ba
Updated README
aalitaiga Oct 16, 2017
a3c08c8
Set properly account in verify_graham
bouthilx Oct 16, 2017
9fb5ab6
Set properly account in verify_cedar
bouthilx Oct 16, 2017
1dea0d8
Fix walltime_to_seconds convertion
bouthilx Oct 17, 2017
02823d8
Add configuration file for Cedar
bouthilx Oct 16, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Cleaned code with PR feedback
aalitaiga committed Sep 26, 2017
commit d7d03005ff83c3d4f0f0beac3bbc4c8cd74886c4
16 changes: 11 additions & 5 deletions smartdispatch/job_generator.py
Original file line number Diff line number Diff line change
@@ -78,7 +78,12 @@ def add_sbatch_flags(self, flags):

for flag in flags:
split = flag.find('=')
options[flag[:split]] = flag[split+1:]
if flag.startswith('--'):
options[flag[2:split]] = flag[split+1:]
elif flag.startswith('-'):
options[flag[1:split]] = flag[split+1:]
else:
raise ValueError("Invalid SBATCH flag ({})".format(flag))

for pbs in self.pbs_list:
pbs.add_sbatch_options(**options)
@@ -182,12 +187,13 @@ def _add_cluster_specific_rules(self):
# Remove forbidden ppn option. Default is 2 cores per gpu.
pbs.resources['nodes'] = re.sub(":ppn=[0-9]+", "", pbs.resources['nodes'])

class SlurmClusterGenerator(JobGenerator):
class SlurmJobGenerator(JobGenerator):

def _add_cluster_specific_rules(self):
for pbs in self.pbs_list:
node_resource = pbs.resources.pop('nodes')
gpus = re.match(".*gpus=([0-9]+)", node_resource).group(1)
ppn = re.match(".*ppn=([0-9]+)", node_resource).group(1)
gpus = re.match(".*gpus=([0-9]+)", pbs.resources['nodes']).group(1)
ppn = re.match(".*ppn=([0-9]+)", pbs.resources['nodes']).group(1)
pbs.resources['nodes'] = re.sub("ppn=[0-9]+", "", pbs.resources['nodes'])
pbs.resources['nodes'] = re.sub(":gpus=[0-9]+", "", pbs.resources['nodes'])
pbs.add_resources(naccelerators=gpus)
pbs.add_resources(ncpus=ppn)
6 changes: 5 additions & 1 deletion smartdispatch/pbs.py
Original file line number Diff line number Diff line change
@@ -70,10 +70,14 @@ def add_sbatch_options(self, **options):
Parameters
----------
**options : dict
each key is the name of a SBATCH option (see `Options`)
each key is the name of a SBATCH option
"""

for option_name, option_value in options.items():
if len(option_name) == 1:
self.sbatch_options["-" + option_name] = option_value
else:
self.sbatch_options["--" + option_name] = option_value
self.sbatch_options[option_name] = option_value

def add_resources(self, **resources):
3 changes: 2 additions & 1 deletion smartdispatch/tests/test_job_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from nose.tools import assert_true, assert_false, assert_equal, assert_raises
import unittest

import os
import tempfile
@@ -7,6 +8,7 @@
from smartdispatch.job_generator import JobGenerator, job_generator_factory
from smartdispatch.job_generator import HeliosJobGenerator, HadesJobGenerator
from smartdispatch.job_generator import GuilliminJobGenerator, MammouthJobGenerator
from smartdispatch.job_generator import SlurmJobGenerator


class TestJobGenerator(object):
@@ -242,7 +244,6 @@ def test_pbs_split_2_job_nb_commands(self):
assert_true("ppn=6" in str(self.pbs8[0]))
assert_true("ppn=2" in str(self.pbs8[1]))


class TestJobGeneratorFactory(object):

def setUp(self):
8 changes: 7 additions & 1 deletion smartdispatch/tests/test_pbs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from nose.tools import assert_true, assert_equal, assert_raises
from numpy.testing import assert_array_equal


from smartdispatch.pbs import PBS
import unittest
import tempfile
@@ -38,6 +37,13 @@ def test_add_options(self):
assert_equal(self.pbs.options["-A"], "option2")
assert_equal(self.pbs.options["-B"], "option3")

def test_add_sbatch_options(self):
    # A one-character option name must be stored under a short "-x" key.
    short_options = {"a": "value1"}
    self.pbs.add_sbatch_options(**short_options)
    assert_equal(self.pbs.sbatch_options["-a"], "value1")

    # Longer option names must be stored under a long "--name" key.
    long_options = {"option1": "value2", "option2": "value3"}
    self.pbs.add_sbatch_options(**long_options)
    for flag, expected in (("--option1", "value2"), ("--option2", "value3")):
        assert_equal(self.pbs.sbatch_options[flag], expected)

def test_add_resources(self):
assert_equal(len(self.pbs.resources), 1)
assert_equal(self.pbs.resources["walltime"], self.walltime)
Original file line number Diff line number Diff line change
@@ -2,9 +2,10 @@
import os
import time
import unittest

from subprocess import Popen, PIPE

from smartdispatch.utils import get_slurm_cluster_name

pbs_string = """\
#!/usr/bin/env /bin/bash

@@ -22,28 +23,8 @@
nvidia-smi
"""

sbatch_string = """\
#!/usr/bin/env -i /bin/zsh

#SBATCH --job-name=arrayJob
#SBATCH --output=arrayJob_%A_%a.out
#SBATCH --time=01:00:00
{}

######################
# Begin work section #
######################

echo "My SLURM_ARRAY_JOB_ID:" $SLURM_ARRAY_JOB_ID
echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID
nvidia-smi
"""

# Checking which cluster is running the tests first
process = Popen("sacctmgr list cluster", stdout=PIPE, stderr=PIPE, shell=True)
stdout, _ = process.communicate()
stdout = stdout.decode()
cluster = stdout.splitlines()[2].strip().split(' ')[0]
cluster = get_slurm_cluster_name()
to_skip = cluster in ['graham', 'cedar']
message = "Test does not run on cluster {}".format(cluster)

@@ -53,14 +34,14 @@ def tearDown(self):
for file_name in (glob('*.out') + ["test.pbs"]):
os.remove(file_name)

def _test_param(self, param_array, command, flag, string=pbs_string, output_array=None):
def _test_param(self, param_array, command_template, flag, string=pbs_string, output_array=None):
output_array = output_array or param_array
for param, output in zip(param_array, output_array):
com = pbs_string.format(
string.format(command.format(param))
param_command = pbs_string.format(
string.format(command_template.format(param))
)
with open("test.pbs", "w") as text_file:
text_file.write(com)
text_file.write(param_command)
process = Popen("sbatch test.pbs", stdout=PIPE, stderr=PIPE, shell=True)
stdout, _ = process.communicate()
stdout = stdout.decode()
7 changes: 6 additions & 1 deletion smartdispatch/utils.py
Original file line number Diff line number Diff line change
@@ -115,7 +115,7 @@ def detect_cluster():
output = Popen(["qstat", "-B"], stdout=PIPE).communicate()[0]
except OSError:
# If qstat is not available we assume that the cluster is unknown.
# TODO: handle MILA + CEDAR + GRAHAM
cluster_name = get_slurm_cluster_name()
return None
# Get server name from status
server_name = output.split('\n')[2].split(' ')[0]
@@ -131,6 +131,11 @@ def detect_cluster():
cluster_name = "hades"
return cluster_name

def get_slurm_cluster_name():
    """Return the Slurm cluster name reported by ``sacctmgr list cluster``.

    The name is the first space-delimited token of the third line of the
    command's standard output (presumably the first two lines are the table
    header and its separator -- confirm against real sacctmgr output).

    Returns
    -------
    cluster_name : str or None
        The parsed name, or None when the command could not be started or
        printed fewer than three lines (e.g. `sacctmgr` is not installed).
    """
    try:
        stdout = Popen("sacctmgr list cluster", stdout=PIPE, shell=True).communicate()[0]
    except OSError:
        # The shell itself could not be spawned; report "no cluster found".
        return None

    lines = stdout.decode().splitlines()
    if len(lines) < 3:
        # With shell=True a missing `sacctmgr` does not raise: the shell only
        # writes to stderr and stdout stays empty, so indexing the third
        # line would fail with an IndexError.
        return None

    return lines[2].strip().split(' ')[0]

def get_launcher(cluster_name):
if cluster_name == "helios":