Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checkpointing (quasi-Newton solver) #693

Merged
merged 27 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7a939b7
added notes and some drafty interface
cnpetra Aug 26, 2024
181f7f9
added draft of the api for checkpointing
cnpetra Aug 28, 2024
0e428f5
fixed compilation issues
cnpetra Aug 28, 2024
354b82b
integrated AXOM
cnpetra Aug 28, 2024
a14a445
added user options for checkpointing
cnpetra Aug 28, 2024
d497955
more work on load checkpoint EOD
cnpetra Sep 3, 2024
8b2342c
Merge branch 'develop' into chkpnt-dev
cnpetra Sep 4, 2024
d4900a9
semi-operation checkpointing
cnpetra Sep 4, 2024
3579828
removed save checkpoint callback from the interface
cnpetra Sep 4, 2024
d213774
fixed typos in comments
cnpetra Sep 4, 2024
4a9fbf1
moved sidre-related code from Algorithm class to a "utils" helper
cnpetra Sep 7, 2024
0498564
switched to refs; some testing of options-based checkpointing
cnpetra Sep 8, 2024
085eb88
added sidre copy to/from dense matrices
cnpetra Sep 11, 2024
b21c3c5
instrumentation for saving quasi-Newton internals to sidre
cnpetra Sep 11, 2024
bfe1b40
updated iteration counter to keep track of total number over restarts
cnpetra Sep 12, 2024
5bc2af6
updated doc; replace all #
cnpetra Sep 13, 2024
a664e35
added example on how to use checkpoint API
cnpetra Sep 13, 2024
261ccf9
clean up
cnpetra Sep 13, 2024
0506d38
added metadata
cnpetra Sep 14, 2024
b785475
testing and clean up
cnpetra Sep 14, 2024
adf30e6
Merge branch 'develop' into chkpnt-dev
cnpetra Sep 22, 2024
4e673d5
update user manual with checkpointing
cnpetra Sep 22, 2024
77c88a2
updated pdf user manual
cnpetra Sep 22, 2024
5631b6c
fix ci errors (compilation)
cnpetra Sep 23, 2024
ea5cafd
fix adtl compilation issues
cnpetra Sep 23, 2024
28c4567
fixed compil error
cnpetra Sep 23, 2024
27d15f2
addresed reviews
cnpetra Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ option(HIOP_USE_EIGEN "Build with Eigen support" ON)
option(HIOP_USE_MPI "Build with MPI support" ON)
option(HIOP_USE_GPU "Build with support for GPUs - CUDA or HIP libraries" OFF)
option(HIOP_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF)
option(HIOP_USE_RAJA "Build with portability abstraction library RAJA" OFF)
option(HIOP_USE_RAJA "Build with portability abstraction library RAJA" OFF)
option(HIOP_USE_AXOM "Build with AXOM to use Sidre for scalable checkpointing" OFF)
option(HIOP_DEEPCHECKS "Extra checks and asserts in the code with a high penalty on performance" OFF)
option(HIOP_WITH_KRON_REDUCTION "Build Kron Reduction code (requires UMFPACK)" OFF)
option(HIOP_DEVELOPER_MODE "Build with extended warnings and options" OFF)
Expand Down Expand Up @@ -289,6 +290,15 @@ if(HIOP_USE_RAJA)
message(STATUS "Found umpire pkg-config: ${umpire_CONFIG}")
endif()

# Optional AXOM dependency, enabled through the HIOP_USE_AXOM option.
if(HIOP_USE_AXOM)
# Config-mode lookup of the AXOM package; AXOM_DIR and AXOM_DIR/lib/cmake/ are
# passed as additional search locations. REQUIRED makes the configure step fail
# if the package cannot be found.
find_package(AXOM CONFIG
PATHS ${AXOM_DIR} ${AXOM_DIR}/lib/cmake/
REQUIRED)
# Add the `axom` target to the INTERFACE link dependencies of hiop_tpl.
target_link_libraries(hiop_tpl INTERFACE axom)
# Report the package configuration file that find_package located.
message(STATUS "Found AXOM pkg-config: ${AXOM_CONFIG}")
endif()


cnpetra marked this conversation as resolved.
Show resolved Hide resolved
if(HIOP_WITH_KRON_REDUCTION)
set(HIOP_UMFPACK_DIR CACHE PATH "Path to UMFPACK directory")
include(FindUMFPACK)
Expand Down
6 changes: 3 additions & 3 deletions src/Interface/hiopInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,8 +467,8 @@ class hiopInterfaceBase
}

/**
* This method is used to provide an user all the hiop iterate
* procedure. @see solution_callback() for an explanation of the parameters.
* This method is used to provide the user with all the internal HiOp iterates. @see solution_callback()
* for an explanation of the parameters.
*
* @param[in] x array of (local) entries of the primal variables (managed by Umpire, see note below)
* @param[in] z_L array of (local) entries of the dual variables for lower bounds (managed by Umpire, see note below)
Expand Down Expand Up @@ -496,7 +496,7 @@ class hiopInterfaceBase
{
return true;
}

/**
* A wildcard function used to change the primal variables.
*
Expand Down
1 change: 1 addition & 0 deletions src/Interface/hiop_defs.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#cmakedefine HIOP_USE_PARDISO
#cmakedefine HIOP_USE_RESOLVE
#cmakedefine HIOP_USE_GINKGO
#cmakedefine HIOP_USE_AXOM
#define HIOP_VERSION "@PROJECT_VERSION@"
#define HIOP_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define HIOP_VERSION_MINOR @PROJECT_VERSION_MINOR@
Expand Down
212 changes: 210 additions & 2 deletions src/Optimization/hiopAlgFilterIPM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,21 @@

#include "hiopCppStdUtils.hpp"

#ifdef HIOP_USE_AXOM
#include <axom/sidre/core/DataStore.hpp>
#include <axom/sidre/core/Group.hpp>
#include <axom/sidre/core/View.hpp>
#include <axom/sidre/spio/IOManager.hpp>
using namespace axom;
#endif

#include <cmath>
#include <cstring>
#include <cassert>
#include <stdio.h>
#include <ctype.h>
#include <exception>
#include <sstream>

namespace hiop
{
Expand Down Expand Up @@ -976,8 +986,37 @@ hiopSolveStatus hiopAlgFilterIPMQuasiNewton::run()

nlp->runStats.tmOptimizTotal.start();

startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d); //this also evaluates the nlp
_mu=mu0;
//
// starting point:
// - user provided (with slack adjustments and lsq eq. duals initialization
// or
// - loaded checkpoint
//
if(nlp->options->GetString("checkpoint_load_on_start") != "yes") {
//this also evaluates the nlp
startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
_mu=mu0;
} else {
//
//checkpoint load
//
//load from file: will populate it_curr, _Hess_lagr, and algorithmic parameters
auto chkpnt_ok = load_state_from_file(nlp->options->GetString("checkpoint_file"));
if(chkpnt_ok) {
//additionally: need to evaluate the nlp
if(!this->evalNlp_noHess(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d)) {
nlp->log->printf(hovError, "Failure in evaluating user NLP functions at loaded checkpoint.");
return Error_In_User_Function;
}
} else {
nlp->log->printf(hovWarning, "Using default starting procedure (no checkpoint load!).\n");
//this also evaluates the nlp
startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
_mu=mu0;
}

solver_status_ = NlpSolve_SolveNotCalled;
}

//update log bar
logbar->updateWithNlpInfo(*it_curr, _mu, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
Expand Down Expand Up @@ -1095,6 +1134,11 @@ hiopSolveStatus hiopAlgFilterIPMQuasiNewton::run()
solver_status_ = User_Stopped; break;
}

#ifdef HIOP_USE_AXOM
//checkpointing - based on options provided by the user
checkpointing_stuff();
#endif

/*************************************************
* Termination check
************************************************/
Expand Down Expand Up @@ -1485,6 +1529,170 @@ void hiopAlgFilterIPMQuasiNewton::outputIteration(int lsStatus, int lsNum, int u
}
}

#ifdef HIOP_USE_AXOM

/**
 * @brief Save the state of the algorithm to a checkpoint file.
 *
 * @param path name of the file passed to sidre's IOManager writer
 * @return true if the state was written, false if an exception was caught
 *
 * @details
 * The state is first copied into a temporary sidre::DataStore (see
 * save_state_to_data_store()) and then written with sidre::IOManager using as
 * many files as there are ranks in the NLP's communicator. The method never
 * throws: any exception is reported to the log and converted to a `false`
 * return value.
 */
bool hiopAlgFilterIPMQuasiNewton::save_state_to_file(const ::std::string& path) noexcept
{
  try {
    // Automatic storage: the data store is released on every exit path.
    sidre::DataStore ds;
    this->save_state_to_data_store(&ds);

    sidre::IOManager writer(this->get_nlp()->get_comm());
    int n_files = 1;
    MPI_Comm_size(this->get_nlp()->get_comm(), &n_files);
    writer.write(ds.getRoot(), n_files, path.c_str(), sidre::Group::getDefaultIOProtocol());
    return true;
  } catch(const std::exception& exp) {
    nlp->log->printf(hovError, "Error in saving checkpoint to file '%s'\n", path.c_str());
    nlp->log->printf(hovError, "  Addtl info: %s\n", exp.what());
  } catch(...) {
    // Exceptions not derived from std::exception would otherwise escape this
    // noexcept function and terminate the program.
    nlp->log->printf(hovError, "Error in saving checkpoint to file '%s'\n", path.c_str());
    nlp->log->printf(hovError, "  Addtl info: unknown exception\n");
  }
  return false;
}

/**
 * @brief Load the state of the algorithm from a checkpoint file.
 *
 * @param path name of the file passed to sidre's IOManager reader
 * @return true if the state was loaded, false if an exception was caught
 *
 * @details
 * The file is read into a temporary sidre::DataStore, whose content is then
 * copied into the algorithm's state by load_state_from_data_store(). The method
 * never throws: any exception (including the std::runtime_error raised for
 * missing or mis-sized views) is reported to the log and converted to a
 * `false` return value.
 */
bool hiopAlgFilterIPMQuasiNewton::load_state_from_file(const ::std::string& path) noexcept
{
  try {
    // Automatic storage: the data store is released on every exit path.
    sidre::DataStore ds;

    sidre::IOManager reader(this->get_nlp()->get_comm());
    // NOTE(review): the trailing `false` is presumably sidre's "preserve contents"
    // flag -- confirm against the IOManager::read documentation.
    reader.read(ds.getRoot(), path, false);

    this->load_state_from_data_store(&ds);
    return true;
  } catch(const std::exception& exp) {
    nlp->log->printf(hovError, "Error in loading checkpoint from file '%s'\n", path.c_str());
    nlp->log->printf(hovError, "  Addtl info: %s\n", exp.what());
  } catch(...) {
    // Exceptions not derived from std::exception would otherwise escape this
    // noexcept function and terminate the program.
    nlp->log->printf(hovError, "Error in loading checkpoint from file '%s'\n", path.c_str());
    nlp->log->printf(hovError, "  Addtl info: unknown exception\n");
  }
  return false;
}

void hiopAlgFilterIPMQuasiNewton::
copy_vec_to_new_view(const ::std::string& name, const hiopVector* vec, sidre::Group* nlp_group)
{
using IndType = sidre::IndexType;
const IndType size = vec->get_local_size();
sidre::View* dest = nlp_group->createViewAndAllocate(name, sidre::DOUBLE_ID, size);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In LiDO, we often use the same DataStore for the whole run. If we were to call this multiple times, I think it will fail because the views would already exist.
Right now this method always makes a new view. It would be better if it checked first if the view already exists. If it already exists in the Group, use that one. If it doesn't already exist, create a new view.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

last commit does that. note that an exception will be thrown if the view exists and has a different number of elements than HiOp's state variable.


const auto stride(dest->getStride());
double *const dest_ptr(dest->getArray());
const double* arr = vec->local_data_host_const();
if(1==stride) {
::std::copy(arr, arr+size, dest_ptr);
} else {
for(IndType i=0; i<size; ++i) {
dest_ptr[i*stride] = arr[i];
}
}
}
/**
 * @brief Copy the content of the named sidre view from nlp_group into a HiOp vector.
 *
 * @param name name of the view to look up in nlp_group
 * @param vec destination vector; its local host data is overwritten
 * @param nlp_group sidre group that is searched for the view
 *
 * @throws std::runtime_error if the view is not found in the group or if its
 * number of elements differs from the local size of vec.
 */
void hiopAlgFilterIPMQuasiNewton::
copy_vec_from_view(const ::std::string& name, hiopVector* vec, const sidre::Group* nlp_group)
{
  const sidre::View* view_const = nlp_group->getView(name);
  if(!view_const) {
    ::std::stringstream ss;
    ss << "Could not find view '" << name << "' in sidre::Group.\n";
    throw ::std::runtime_error(ss.str());
  }
  // const_cast because View does not have a const getArray(); the view is only read from
  sidre::View* view = const_cast<sidre::View*>(view_const);

  const hiop::size_type size = vec->get_local_size();
  if(view->getNumElements() != size) {
    ::std::stringstream ss;
    ss << "Size mismatch for state/view '" << name << "' between hiop and sidre::View: " <<
      " HiOp state is " << size << " doubles, while the view has " << view->getNumElements() <<
      " double elements.\n";
    throw ::std::runtime_error(ss.str());
  }

  double* arr_dest = vec->local_data_host();
  const double *const arr_src = view->getArray();
  const auto stride(view->getStride());
  if(1==stride) {
    // contiguous view: bulk copy
    ::std::copy(arr_src, arr_src+size, arr_dest);
  } else {
    // strided view: gather every stride-th entry
    for(hiop::index_type i=0; i<size; ++i) {
      arr_dest[i] = arr_src[i*stride];
    }
  }
}

void hiopAlgFilterIPMQuasiNewton::save_state_to_data_store(::axom::sidre::DataStore* ds)
{
using IndType = sidre::IndexType;
sidre::Group* nlp_group = ds->getRoot()->createGroup("hiop solver");

//create views for each member that needs to be saved
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
copy_vec_to_new_view("x", it_curr->get_x(), nlp_group);
copy_vec_to_new_view("d", it_curr->get_d(), nlp_group);
copy_vec_to_new_view("sxl", it_curr->get_sxl(), nlp_group);
copy_vec_to_new_view("sxu", it_curr->get_sxu(), nlp_group);
copy_vec_to_new_view("sdl", it_curr->get_sdl(), nlp_group);
copy_vec_to_new_view("sdu", it_curr->get_sdu(), nlp_group);
copy_vec_to_new_view("yc", it_curr->get_yc(), nlp_group);
copy_vec_to_new_view("zl", it_curr->get_zl(), nlp_group);
copy_vec_to_new_view("zu", it_curr->get_zu(), nlp_group);
copy_vec_to_new_view("vl", it_curr->get_vl(), nlp_group);
copy_vec_to_new_view("vu", it_curr->get_vu(), nlp_group);

cnpetra marked this conversation as resolved.
Show resolved Hide resolved
//quasi-Newton Hessian approximation

//algorithmic parameters for this state
//mu, iteration number
const double alg_params[] = {_mu};
cnpetra marked this conversation as resolved.
Show resolved Hide resolved

}

/**
 * @brief Load the iterate of the algorithm from the "hiop solver" group of a sidre data store.
 *
 * @param ds data store whose root group is searched for the "hiop solver" group
 *
 * @throws std::runtime_error if the "hiop solver" group is missing, or (from
 * copy_vec_from_view()) if one of the views is missing or has a size different
 * from the corresponding HiOp vector.
 */
void hiopAlgFilterIPMQuasiNewton::load_state_from_data_store(const sidre::DataStore* ds)
{
  const sidre::Group* nlp_group = ds->getRoot()->getGroup("hiop solver");
  if(!nlp_group) {
    // fail with a catchable error rather than dereferencing a null group below
    throw ::std::runtime_error("Could not find group 'hiop solver' in sidre::DataStore.\n");
  }

  // primal variables and slacks
  copy_vec_from_view("x", it_curr->get_x(), nlp_group);
  copy_vec_from_view("d", it_curr->get_d(), nlp_group);
  copy_vec_from_view("sxl", it_curr->get_sxl(), nlp_group);
  copy_vec_from_view("sxu", it_curr->get_sxu(), nlp_group);
  copy_vec_from_view("sdl", it_curr->get_sdl(), nlp_group);
  copy_vec_from_view("sdu", it_curr->get_sdu(), nlp_group);
  // duals
  copy_vec_from_view("yc", it_curr->get_yc(), nlp_group);
  copy_vec_from_view("zl", it_curr->get_zl(), nlp_group);
  copy_vec_from_view("zu", it_curr->get_zu(), nlp_group);
  copy_vec_from_view("vl", it_curr->get_vl(), nlp_group);
  copy_vec_from_view("vu", it_curr->get_vu(), nlp_group);
}

void hiopAlgFilterIPMQuasiNewton::checkpointing_stuff()
Copy link
Collaborator

@nychiang nychiang Sep 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style Guidelines.
There should be spaces before and after each operator, e.g. line 1675, 1655,...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style Guidelines. There should be spaces before and after each operator, e.g. line 1675, 1680,...

can you be more specific about the guideline?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean space before and after operator "==".

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our CMake build that uses BLT auto formats everything using clang-format (and a configuration file) with a make style. Also make check verifies that the code matches the clang-format style configuration, so a PR can't be merged without complying with the style.

{
if(nlp->options->GetString("checkpoint_save")=="no") {
return;
}
int chk_every_N = nlp->options->GetInteger("checkpoint_save_every_N_iter");
//check iteration
if(iter_num>0 && iter_num % chk_every_N==0) {
using ::std::string;
// replace "#" in checkpointing file with iteration number
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
string path = nlp->options->GetString("checkpoint_file");
auto pos = path.find("#");
if(string::npos != pos) {
auto s_it_num = ::std::to_string(iter_num);
path.replace(pos, 1, s_it_num);
}

nlp->log->printf(hovSummary, "Saving checkpoint at iter %d in '%s'.\n", iter_num, path.c_str());
//actual checkpointing via axom::sidre
save_state_to_file(path);
}
}
#endif // HIOP_USE_AXOM

/******************************************************************************************************
* FULL NEWTON IPM
Expand Down
74 changes: 73 additions & 1 deletion src/Optimization/hiopAlgFilterIPM.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,15 @@
#include "hiopPDPerturbation.hpp"
#include "hiopFactAcceptor.hpp"

#ifdef HIOP_USE_AXOM
#include <axom/sidre/core/DataStore.hpp>
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
#endif

#include "hiopTimer.hpp"

namespace hiop
{

cnpetra marked this conversation as resolved.
Show resolved Hide resolved
class hiopAlgFilterIPMBase {
public:
hiopAlgFilterIPMBase(hiopNlpFormulation* nlp_, const bool within_FR = false);
Expand Down Expand Up @@ -117,6 +121,8 @@ class hiopAlgFilterIPMBase {
{
return filter.contains(theta, logbar_obj);
}

/// Setter for the primal steplength.
inline void set_alpha_primal(const double alpha_primal) { _alpha_primal = alpha_primal; }

protected:
Expand Down Expand Up @@ -339,8 +345,74 @@ class hiopAlgFilterIPMQuasiNewton : public hiopAlgFilterIPMBase
virtual ~hiopAlgFilterIPMQuasiNewton();

virtual hiopSolveStatus run();

// note that checkpointing is only available with an axom-enabled build
#ifdef HIOP_USE_AXOM
/**
* @brief Save state of HiOp algorithm to a sidre data store.
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @param data_store a pointer to DataStore
*
* @details
* A new sidre::group "hiop solver" is created within data_store. Then a sidre::View
* is created within the group for each of the algorithm's states and the states are
* copied in the corresponding views.
*/
virtual void save_state_to_data_store(::axom::sidre::DataStore* data_store);
cnpetra marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Load state of HiOp algorithm from a sidre data store.
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @param data_store a pointer to DataStore
*
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @details
* Copies views from the data store sidre::group (named "hiop solver") to HiOp algorithm's
* state variables. The group should be created by first calling save_state_to_data_store
* for a problem/NLP of the same sizes as the problem for which load_state_from_data_store
* is called. This ensures the views have the names and sizes expected by this method.
* Otherwise a std::runtime_error exception is thrown.
*/
virtual void load_state_from_data_store(const ::axom::sidre::DataStore* data_store);
cnpetra marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Save the state of the algorithm to the file
* @param path the name of the file
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @return true if successful, false otherwise
*
* @details
* Internally, HiOp uses axom::sidre::DataStore and sidre's scalable IO. A detailed
* error description is sent to the log if an error or exception is caught.
*/
bool save_state_to_file(const ::std::string& path) noexcept;

/**
* @brief load the state of the algorithm from file
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @param path the name of the file to load from
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @return true if successful, false otherwise
*
* @details
* The file should contain an axom::sidre::DataStore that was previously saved using
* save_state_to_file(). A detailed error description is sent to the log if an error
* or exception is caught.
*/
bool load_state_from_file(const ::std::string& path) noexcept;
#endif // HIOP_USE_AXOM
private:
virtual void outputIteration(int lsStatus, int lsNum, int use_soc = 0, int use_fr = 0);

#ifdef HIOP_USE_AXOM
///@brief The options-based logic for saving a checkpoint and the call to save_state_to_file().
void checkpointing_stuff();

/**
* @brief Copy HiOp vector to a (new) axom::sidre::View.
*
* @details A new view is created/allocated within the sidre group.
*/
void copy_vec_to_new_view(const ::std::string& name, const hiopVector* vec, ::axom::sidre::Group* nlp_group);

/// Copy content of the named sidre view from nlp_group into HiOp Vector.
void copy_vec_from_view(const ::std::string& name, hiopVector* vec, const axom::sidre::Group* nlp_group);
#endif // HIOP_USE_AXOM

private:
hiopNlpDenseConstraints* nlpdc;
private:
Expand Down
Loading
Loading