diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..8d11d90 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,4 @@ +python: + pip_install: true + extra_requirements: + - doc \ No newline at end of file diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..969fbe5 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,3 @@ +build +_static +_templates diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..74a3f72 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = SmartDispatch +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/autoresume.rst b/docs/source/autoresume.rst new file mode 100644 index 0000000..324cfd9 --- /dev/null +++ b/docs/source/autoresume.rst @@ -0,0 +1,14 @@ +==================== +Automatic resumption +==================== + +Oftentimes, there is a hard limit on the maximum amount of time you can run your +job for on the cluster (we refer to it as **walltime**). Smart Dispatch allows you +to partially overcome that and run your tasks for longer periods. This is done +by enhancing generated PBS files with additional code that reschedules your +tasks as soon as they hit the walltime. The caveat here is that your tasks +**must be resumable**, i.e. be capable of restoring their state after being +killed and rerun. 
+ +You can engage the autoresumption by passing ``-m`` or ``--autoresume`` during +``smart-dispatch`` execution. See :doc:`usage` for details. diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..afd2228 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +# +# Smart Dispatch documentation build configuration file, created by +# sphinx-quickstart on Fri Feb 17 15:44:10 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import sys +sys.path.insert(0, os.path.abspath('../..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.autoprogram'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'Smart Dispatch' +copyright = u'2017, Stanislas Lauly, Marc-Alexandre Côté, Mathieu Germain' +author = u'Stanislas Lauly, Marc-Alexandre Côté, Mathieu Germain' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'2.0' +# The full version, including alpha/beta/rc tags. +release = u'2.0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'SmartDispatchdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'SmartDispatch.tex', u'Smart Dispatch Documentation', + u'Stanislas Lauly, Marc-Alexandre Côté, Mathieu Germain', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'smartdispatch', u'Smart Dispatch Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'SmartDispatch', u'Smart Dispatch Documentation', + author, 'SmartDispatch', 'One line description of project.', + 'Miscellaneous'), +] + + + diff --git a/docs/source/examples.rst b/docs/source/examples.rst new file mode 100644 index 0000000..fe2f575 --- /dev/null +++ b/docs/source/examples.rst @@ -0,0 +1,62 @@ +======== +Examples +======== + +Launch job +---------- + +:: + + smart-dispatch -q qtest@mp2 launch python my_script.py 2 80 tanh 0.1 + +Will launch ``python my_script.py 2 80 tanh 0.1`` on the queue ``qtest@mp2``. + + +Launch batch of jobs +-------------------- + +Automatically generate commands from combinations of arguments. 
:: + + smart-dispatch -q qtest@mp2 launch python my_script.py [1 2] 80 [tanh sigmoid] 0.1 + +Will generate 4 different commands and launch them on the queue ``qtest@mp2``: :: + + python my_script.py 1 80 sigmoid 0.1 + python my_script.py 1 80 tanh 0.1 + python my_script.py 2 80 sigmoid 0.1 + python my_script.py 2 80 tanh 0.1 + + +Another possibility is to generate arguments from a range. :: + + smart-dispatch -q qtest@mp2 launch python my_script.py [1:4] + +Will generate: :: + + python my_script.py 1 + python my_script.py 2 + python my_script.py 3 + +You can also add a step size to the range as the 3rd argument. :: + + smart-dispatch -q qtest@mp2 launch python my_script.py [1:10:2] + +Will generate: :: + + python my_script.py 1 + python my_script.py 3 + python my_script.py 5 + python my_script.py 7 + python my_script.py 9 + + +Resuming job/batch +------------------ + +:: + + smart-dispatch -q qtest@mp2 resume {batch_id} + +Jobs that did not terminate properly, for example because they exceeded the walltime, can be resumed using the ``{batch_id}`` given to you upon launch. Of course, all this assumes your script is resumable. + +*Note: Jobs are always in a batch, even if it's a batch of one.* diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ded1882 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,40 @@ +Welcome to Smart Dispatch's documentation! +========================================== + +Smart Dispatch is an easy to use job launcher for supercomputers with PBS compatible job manager. + +Features +-------- + * Launch multiple jobs with a single line. + * Automatically generate combinations of arguments. See :doc:`examples`. + * Automatic resources management. Determine for you the optimal fit for your commands on nodes. + * Resume batch of commands. + * Easily manage logs. + * Advanced mode for complete control. + * Use automatic rescheduling of jobs that hit the walltime. See :doc:`autoresume`. 
+ + +Installing +---------- + +Use the ``pip`` package manager: :: + + pip install git+https://github.com/SMART-Lab/smartdispatch + + +Contents +-------- + +.. toctree:: + :maxdepth: 2 + + usage + examples + autoresume + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 0000000..49dd2fd --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,76 @@ +===== +Usage +===== + +.. autoprogram:: scripts/smart-dispatch:get_parser() + :prog: smart-dispatch + + +Hierarchy of generated files +---------------------------- + +In order to understand the contents of the generated folders/files, it's good to know how ``smart-dispatch`` deals with **commands** that a user requests to launch on the cluster: + +* Each invocation of ``smart-dispatch`` creates a so-called **batch** of **jobs**. Smart Dispatch will do its best to create as many simultaneous jobs as needed to effectively utilize the allocated resources. +* Each job is basically a single PBS file that is run by the queue management system on the cluster (either ``msub`` or ``qsub``). +* A job spawns multiple concurrent **workers** that all cooperate to execute the requested commands. +* Each worker (basically, a Python script) executes commands sequentially. + +A typical hierarchy of ``./SMART_DISPATCH_LOGS/{batch_id}/`` should look like this: :: + + commands/ + job_commands_0.sh + job_commands_1.sh + ... + commands.txt + running_commands.txt + failed_commands.txt + logs/ + job/ + 150472.gpu-srv1.helios.err + 150472.gpu-srv1.helios.out + ... + worker/ + 150472.gpu-srv1.helios_worker_0.e + 150472.gpu-srv1.helios_worker_0.o + 150472.gpu-srv1.helios_worker_1.e + 150472.gpu-srv1.helios_worker_1.o + ... + 4d501b8b9805796ee913153e2493d7069a8bfb1aa469a50279940752bf26c935.err + 4d501b8b9805796ee913153e2493d7069a8bfb1aa469a50279940752bf26c935.out + ... 
+ command_line.log + jobs_id.txt + +The root directory contains two files: + +``command_line.log``: + The full command that was used to invoke ``smart-dispatch``. +``jobs_id.txt``: + A list of job IDs being run. + +Now let's go through the subdirectories. + + +``commands/`` +^^^^^^^^^^^^^ + +This directory holds generated PBS files (``job_commands_{pbs_index}.sh``) as well as three command lists: + +``commands.txt``: + A list of pending commands (this is where the workers take their next commands to execute from). +``running_commands.txt``: + A list of currently running commands. +``failed_commands.txt``: + A list of failed commands. + + +``logs/`` +^^^^^^^^^ + +Output and error logs are saved in this directory. The root level contains logs for actual commands. There are also two additional subfolders: + +``job/``: + Holds logs for the PBS files. +``worker/``: + Holds logs for workers. diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index 86904fa..004e6bb 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -185,7 +185,7 @@ def main(): print "\nLogs, command, and jobs id related to this batch will be in:\n {smartdispatch_folder}".format(smartdispatch_folder=path_job) -def parse_arguments(): +def get_parser(): parser = argparse.ArgumentParser() parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') @@ -215,6 +215,11 @@ def parse_arguments(): resume_parser.add_argument('--expandPool', type=int, nargs='?', const=sys.maxsize, help='Add workers to the given batch. 
Default: # pending jobs.') resume_parser.add_argument("batch_uid", help="Batch UID of the jobs to resume.") + return parser + + +def parse_arguments(): + parser = get_parser() args = parser.parse_args() # Check for invalid arguments in diff --git a/setup.py b/setup.py index 1da3ffe..cc69a84 100644 --- a/setup.py +++ b/setup.py @@ -15,5 +15,8 @@ description='An easy to use job launcher for supercomputers with PBS compatible job manager.', long_description=open('README.md').read(), install_requires=['psutil>=1'], + extras_require={ + 'doc': ['sphinxcontrib-autoprogram>=0.1.3'] + }, package_data={'smartdispatch': ['config/*.json']} ) diff --git a/smartdispatch.sublime-project b/smartdispatch.sublime-project new file mode 100644 index 0000000..24db303 --- /dev/null +++ b/smartdispatch.sublime-project @@ -0,0 +1,8 @@ +{ + "folders": + [ + { + "path": "." + } + ] +}