added template for data reader to pass conduit node from driver #2473

Draft · wants to merge 3 commits into base: develop
2 changes: 1 addition & 1 deletion cmake/configure_files/LBANNConfig.cmake.in
@@ -74,7 +74,7 @@ set(LBANN_HAS_DIHYDROGEN @LBANN_HAS_DIHYDROGEN@)
set(LBANN_HAS_DISTCONV @LBANN_HAS_DISTCONV@)
set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@)
set(LBANN_HAS_EMBEDDED_PYTHON @LBANN_HAS_EMBEDDED_PYTHON@)
-set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@
+set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@)
set(LBANN_HAS_FFTW_FLOAT @LBANN_HAS_FFTW_FLOAT@)
set(LBANN_HAS_FFTW_DOUBLE @LBANN_HAS_FFTW_DOUBLE@)
set(LBANN_HAS_GPU_FP16 @LBANN_HAS_GPU_FP16@)
12 changes: 8 additions & 4 deletions core-driver/CMakeLists.txt
@@ -1,5 +1,9 @@
-cmake_minimum_required(VERSION 3.18.0)
-project(my_lbann_test C CXX)
+cmake_minimum_required(VERSION 3.21.0)
+project(lbann-test-driver CXX)
find_package(LBANN 0.102.0 REQUIRED)
-add_executable(Main main.cpp)
-target_link_libraries(Main PRIVATE LBANN::lbann)
+add_executable(lbann-test-driver main.cpp)
+target_link_libraries(lbann-test-driver PRIVATE LBANN::lbann)
+
+set_target_properties(lbann-test-driver
+  PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
107 changes: 93 additions & 14 deletions core-driver/main.cpp
@@ -29,8 +29,11 @@
#include <mpi.h>
#include <stdio.h>

+// Add test-specific options
void construct_opts(int argc, char **argv) {
  auto& arg_parser = lbann::global_argument_parser();
+  lbann::construct_std_options();
+  lbann::construct_datastore_options();
  arg_parser.add_option("samples",
                        {"-n"},
                        "Number of samples to run inference on",
@@ -52,20 +55,76 @@ void construct_opts(int argc, char **argv) {
                        "Number of labels in dataset",
                        10);
  arg_parser.add_option("minibatchsize",
-                       {"-mbs"},
+                       {"--mbs"},
                        "Number of samples in a mini-batch",
                        16);
+  arg_parser.add_flag("use_conduit",
+                      {"--conduit"},
+                      "Use Conduit node samples (Default is non-distributed matrix)");
+  arg_parser.add_flag("use_dist_matrix",
+                      {"--dist"},
+                      "Use Hydrogen distributed matrix (Default is non-distributed matrix)");
  arg_parser.add_required_argument<std::string>
    ("model",
     "Directory containing checkpointed model");
  arg_parser.parse(argc, argv);
}

-El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
-random_samples(El::Grid const& g, int n, int c, int h, int w) {
+// Generates random samples and labels for MNIST data in a Hydrogen matrix
+std::map<
+  std::string,
+  El::Matrix<float, El::Device::CPU>>
+mat_mnist_samples(int n, int c, int h, int w)
+{
+  El::Matrix<float, El::Device::CPU>
+    samples(c * h * w, n);
+  El::MakeUniform(samples);
+  El::Matrix<float, El::Device::CPU>
+    labels(1, n);
+  El::MakeUniform(labels);
+  std::map<
+    std::string,
+    El::Matrix<float, El::Device::CPU>>
+    samples_map = {{"data/samples", samples}, {"data/labels", labels}};
+  return samples_map;
+}
+
+// Generates random samples and labels for MNIST data in a Hydrogen distributed matrix
+std::map<
+  std::string,
+  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
+distmat_mnist_samples(El::Grid const& g, int n, int c, int h, int w)
+{
  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
-    samples(n, c * h * w, g);
+    samples(c * h * w, n, g);
  El::MakeUniform(samples);
+  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
+    labels(1, n, g);
+  El::MakeUniform(labels);
+  std::map<
+    std::string,
+    El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
+    samples_map = {{"data/samples", samples}, {"data/labels", labels}};
+  return samples_map;
+}
+
+// Fills array with random values in [0, 1)
+void random_fill(float *arr, int size, int max_val=255) {
+  for (int i = 0; i < size; i++) {
+    arr[i] = (float)(std::rand() % max_val) / (float)max_val;
+  }
+}
+
+// Generates random samples and labels for MNIST data in a vector of Conduit nodes
+std::vector<conduit::Node> conduit_mnist_samples(int n, int c, int h, int w) {
+  std::vector<conduit::Node> samples(n);
+  int sample_size = c * h * w;
+  std::vector<float> this_sample(sample_size);
+  for (int i = 0; i < n; i++) {
+    random_fill(this_sample.data(), sample_size);
+    samples[i]["data/samples"].set(this_sample.data(), sample_size);
+    samples[i]["data/labels"] = std::rand() % 10;
+  }
+  return samples;
+}

@@ -79,10 +138,13 @@ int main(int argc, char **argv) {
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

-  // Get input arguments and print values
+  // Get input arguments, check them, and print values
  construct_opts(argc, argv);
  auto& arg_parser = lbann::global_argument_parser();
  if (rank == 0) {
+    if (arg_parser.get<bool>("use_conduit") && arg_parser.get<bool>("use_dist_matrix")) {
+      LBANN_ERROR("Cannot use Conduit nodes and a distributed matrix together; choose one of --conduit or --dist");
+    }
    std::stringstream msg;
    msg << "Model: " << arg_parser.get<std::string>("model") << std::endl;
    msg << "{ N, c, h, w } = { " << arg_parser.get<int>("samples") << ", ";
@@ -94,8 +156,8 @@ int main(int argc, char **argv) {
    std::cout << msg.str();
  }

  // Load model and run inference on samples
  auto lbann_comm = lbann::initialize_lbann(MPI_COMM_WORLD);

  auto m = lbann::load_inference_model(lbann_comm.get(),
                                       arg_parser.get<std::string>("model"),
                                       arg_parser.get<int>("minibatchsize"),
@@ -105,14 +167,31 @@ int main(int argc, char **argv) {
                                       arg_parser.get<int>("width")
                                       },
                                       {arg_parser.get<int>("labels")});
-  auto samples = random_samples(lbann_comm->get_trainer_grid(),
-                                arg_parser.get<int>("samples"),
-                                arg_parser.get<int>("channels"),
-                                arg_parser.get<int>("height"),
-                                arg_parser.get<int>("width"));
-  auto labels = lbann::infer(m.get(),
-                             samples,
-                             arg_parser.get<int>("minibatchsize"));

+  // Three options for data generation
+  if (arg_parser.get<bool>("use_conduit")) {
+    auto samples = conduit_mnist_samples(arg_parser.get<int>("samples"),
+                                         arg_parser.get<int>("channels"),
+                                         arg_parser.get<int>("height"),
+                                         arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  } else if (arg_parser.get<bool>("use_dist_matrix")) {
+    auto samples = distmat_mnist_samples(lbann_comm->get_trainer_grid(),
+                                         arg_parser.get<int>("samples"),
+                                         arg_parser.get<int>("channels"),
+                                         arg_parser.get<int>("height"),
+                                         arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  } else {
+    auto samples = mat_mnist_samples(arg_parser.get<int>("samples"),
+                                     arg_parser.get<int>("channels"),
+                                     arg_parser.get<int>("height"),
+                                     arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  }
+
+  auto labels = lbann::inference(m.get());

  // Print inference results
  if (lbann_comm->am_world_master()) {
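The Conduit path above is split across several hunks, so here is a condensed, self-contained sketch of how one sample is assembled. It mirrors conduit_mnist_samples and uses only the node layout shown in this diff ("data/samples" and "data/labels"); the make_sample helper is illustrative, not part of the PR.

#include "conduit/conduit.hpp"
#include <cstdlib>
#include <vector>

// Build one MNIST-shaped sample the way conduit_mnist_samples does:
// a flat float array under "data/samples", an integer class label
// under "data/labels".
conduit::Node make_sample(int c, int h, int w)
{
  std::vector<float> pixels(c * h * w);
  for (auto& p : pixels) {
    p = static_cast<float>(std::rand() % 255) / 255.0f; // values in [0, 1)
  }
  conduit::Node sample;
  sample["data/samples"].set(pixels.data(),
                             static_cast<conduit::index_t>(pixels.size()));
  sample["data/labels"] = std::rand() % 10; // one of 10 MNIST classes
  return sample;
}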
10 changes: 10 additions & 0 deletions core-driver/run.sh
@@ -0,0 +1,10 @@
export AL_PROGRESS_RANKS_PER_NUMA_NODE=2
export OMP_NUM_THREADS=8
export MV2_USE_RDMA_CM=0

# This should be a checkpointed lenet model
MODEL_LOC="path/to/checkpointed/model"

./bin/lbann-test-driver $MODEL_LOC
./bin/lbann-test-driver $MODEL_LOC --dist
./bin/lbann-test-driver $MODEL_LOC --conduit
1 change: 1 addition & 0 deletions include/lbann/data_ingestion/readers/CMakeLists.txt
@@ -29,6 +29,7 @@ set_full_path(THIS_DIR_HEADERS
  metadata.hpp
  # Data readers
  data_reader_cifar10.hpp
+  data_reader_conduit.hpp
  data_reader_csv.hpp
  data_reader_image.hpp
  data_reader_HDF5.hpp
72 changes: 72 additions & 0 deletions include/lbann/data_ingestion/readers/data_reader_conduit.hpp
@@ -0,0 +1,72 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014-2021, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
// the CONTRIBUTORS file. <[email protected]>
//
// LLNL-CODE-697807.
// All rights reserved.
//
// This file is part of LBANN: Livermore Big Artificial Neural Network
// Toolkit. For details, see http://software.llnl.gov/LBANN or
// https://github.com/LLNL/LBANN.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
////////////////////////////////////////////////////////////////////////////////

#ifndef LBANN_DATA_READER_CONDUIT_HPP
#define LBANN_DATA_READER_CONDUIT_HPP

#include "lbann/data_readers/data_reader.hpp"
#include "lbann/data_store/data_store_conduit.hpp"

namespace lbann {
/**
 * A generalized data reader for Conduit nodes passed in from an external driver.
 */
class conduit_data_reader : public generic_data_reader
{
public:
conduit_data_reader* copy() const override { return new conduit_data_reader(*this); }
bool has_conduit_output() override { return true; }
void load() override;
bool fetch_conduit_node(conduit::Node& sample, int data_id) override;

void set_data_dims(std::vector<int> dims);
void set_label_dims(std::vector<int> dims);

std::string get_type() const override { return "conduit_data_reader"; }
int get_linearized_data_size() const override {
int data_size = 1;
for(int i : m_data_dims) {
data_size *= i;
}
return data_size;
}
int get_linearized_label_size() const override {
int label_size = 1;
for(int i : m_label_dims) {
label_size *= i;
}
return label_size;
}

private:
std::vector<int> m_data_dims;
std::vector<int> m_label_dims;

}; // END: class conduit_data_reader

} // namespace lbann

#endif // LBANN_DATA_READER_CONDUIT_HPP
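For orientation, a minimal usage sketch of the accessors this header declares, under the MNIST-shaped defaults used by the test driver ({c, h, w} = {1, 28, 28}, 10 labels); the configure_reader wrapper is hypothetical, not part of this PR.

#include "lbann/data_ingestion/readers/data_reader_conduit.hpp"

// Illustrative only: dims follow the driver's MNIST defaults.
void configure_reader(lbann::conduit_data_reader& reader)
{
  reader.set_data_dims({1, 28, 28}); // get_linearized_data_size() -> 784
  reader.set_label_dims({10});       // get_linearized_label_size() -> 10
}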
include/lbann/execution_algorithms/batch_functional_inference_algorithm.hpp
@@ -40,8 +40,7 @@ namespace lbann {
*
* This execution algorithm is meant for running inference using a trained
* model and samples passed by the user from an external application. The
-* algorithm currently assumes that there is only 1 input layer in the model,
-* and the output layer is a softmax layer.
+* algorithm currently assumes that the output layer is a softmax layer.
*/
class batch_functional_inference_algorithm
{
@@ -73,111 +72,16 @@

/** @brief Run model inference on samples and return predicted categories.
 * @param[in] model A trained model
-* @param[in] samples A distributed matrix containing samples for model input
-* @param[in] mbs The max mini-batch size
 * @return Matrix of predicted labels (by index)
 */
-template <typename DataT,
-          El::Dist CDist,
-          El::Dist RDist,
-          El::DistWrap DistView,
-          El::Device Device>
-El::Matrix<int, El::Device::CPU>
-infer(observer_ptr<model> model,
-      El::DistMatrix<DataT, CDist, RDist, DistView, Device> const& samples,
-      size_t mbs)
-{
-  if (mbs <= 0) {
-    LBANN_ERROR("mini-batch size must be larger than 0");
-  }
-
-  // Make matrix for returning predicted labels
-  size_t samples_size = samples.Height();
-  El::Matrix<int, El::Device::CPU> labels(samples_size, 1);
-
-  // BVE FIXME
-  // Create an SGD_execution_context so that layer.forward_prop can get the
-  // mini_batch_size - This should be fixed in the future, when SGD is not so
-  // hard-coded into the model & layers
-  auto c = SGDExecutionContext(execution_mode::inference);
-  model->reset_mode(c, execution_mode::inference);
-  // Explicitly set the size of the mini-batch that the model is executing
-  model->set_current_mini_batch_size(mbs);
-
-  // Infer on mini batches
-  for (size_t i = 0; i < samples_size; i += mbs) {
-    size_t mb_idx = std::min(i + mbs, samples_size);
-    auto mb_range = El::IR(i, mb_idx);
-    auto mb_samples = El::LockedView(samples, mb_range, El::ALL);
-    auto mb_labels = El::View(labels, mb_range, El::ALL);
-
-    infer_mini_batch(*model, mb_samples);
-    get_labels(*model, mb_labels);
-  }
-
-  return labels;
-}
+El::Matrix<El::Int, El::Device::CPU> infer(observer_ptr<model> model);

protected:
-/** @brief Run model inference on a single mini-batch of samples
- * This method takes a mini-batch of samples, inserts them into the input
- * layer of the model, and runs forward prop on the model.
- * @param[in] model A trained model
- * @param[in] samples A distributed matrix containing samples for model input
- */
-template <typename DataT,
-          El::Dist CDist,
-          El::Dist RDist,
-          El::DistWrap DistView,
-          El::Device Device>
-void infer_mini_batch(
-  model& model,
-  El::DistMatrix<DataT, CDist, RDist, DistView, Device> const& samples)
-{
-  for (int i = 0; i < model.get_num_layers(); i++) {
-    auto& l = model.get_layer(i);
-    // Insert samples into the input layer
-    if (l.get_type() == "input") {
-      auto& il = dynamic_cast<input_layer<DataType>&>(l);
-      il.set_samples(samples);
-    }
-  }
-  model.forward_prop(execution_mode::inference);
-}

/** @brief Finds the predicted category in a model's softmax layer
 * @param[in] model A model that has been used for inference
 * @param[in] labels A matrix to place predicted category labels
 */
-void get_labels(model& model, El::Matrix<int, El::Device::CPU>& labels)
-{
-  int pred_label = 0;
-  float max, col_value;
-
-  for (const auto* l : model.get_layers()) {
-    // Find the output layer
-    if (l->get_type() == "softmax") {
-      auto const& dtl =
-        dynamic_cast<lbann::data_type_layer<float> const&>(*l);
-      const auto& outputs = dtl.get_activations();
-
-      // Find the prediction for each sample
-      int col_count = outputs.Width();
-      int row_count = outputs.Height();
-      for (int i = 0; i < col_count; i++) {
-        max = 0;
-        for (int j = 0; j < row_count; j++) {
-          col_value = outputs.Get(i, j);
-          if (col_value > max) {
-            max = col_value;
-            pred_label = j;
-          }
-        }
-        labels(i) = pred_label;
-      }
-    }
-  }
-}
+void get_labels(model& model, El::Matrix<El::Int, El::Device::CPU>& labels);
};

} // namespace lbann
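With get_labels reduced to a declaration, the label-extraction logic that used to live in this header is no longer visible. For orientation, a standalone sketch of the same idea, assuming the usual LBANN layout in which softmax activations hold one class per row and one sample per column; the argmax_labels function and its shapes are illustrative, not LBANN API.

#include <El.hpp>

// Return each sample's predicted class: the row index of the largest
// softmax activation in that sample's column.
El::Matrix<El::Int, El::Device::CPU>
argmax_labels(El::Matrix<float, El::Device::CPU> const& outputs)
{
  El::Int const num_classes = outputs.Height();
  El::Int const num_samples = outputs.Width();
  El::Matrix<El::Int, El::Device::CPU> labels(num_samples, 1);
  for (El::Int col = 0; col < num_samples; ++col) {
    El::Int pred = 0;
    float max_val = outputs.Get(0, col);
    for (El::Int row = 1; row < num_classes; ++row) {
      float const val = outputs.Get(row, col);
      if (val > max_val) { // keep the largest activation seen so far
        max_val = val;
        pred = row;
      }
    }
    labels.Set(col, 0, pred);
  }
  return labels;
}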