added template for data reader to pass conduit node from driver #2473

Draft · wants to merge 3 commits into base: develop
2 changes: 1 addition & 1 deletion cmake/configure_files/LBANNConfig.cmake.in
@@ -74,7 +74,7 @@ set(LBANN_HAS_DIHYDROGEN @LBANN_HAS_DIHYDROGEN@)
set(LBANN_HAS_DISTCONV @LBANN_HAS_DISTCONV@)
set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@)
set(LBANN_HAS_EMBEDDED_PYTHON @LBANN_HAS_EMBEDDED_PYTHON@)
-set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@
+set(LBANN_HAS_FFTW @LBANN_HAS_FFTW@)
set(LBANN_HAS_FFTW_FLOAT @LBANN_HAS_FFTW_FLOAT@)
set(LBANN_HAS_FFTW_DOUBLE @LBANN_HAS_FFTW_DOUBLE@)
set(LBANN_HAS_GPU_FP16 @LBANN_HAS_GPU_FP16@)
12 changes: 8 additions & 4 deletions core-driver/CMakeLists.txt
@@ -1,5 +1,9 @@
-cmake_minimum_required(VERSION 3.18.0)
-project(my_lbann_test C CXX)
+cmake_minimum_required(VERSION 3.21.0)
+project(lbann-test-driver CXX)
find_package(LBANN 0.102.0 REQUIRED)
-add_executable(Main main.cpp)
-target_link_libraries(Main PRIVATE LBANN::lbann)
+add_executable(lbann-test-driver main.cpp)
+target_link_libraries(lbann-test-driver PRIVATE LBANN::lbann)
+
+set_target_properties(lbann-test-driver
+  PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
107 changes: 93 additions & 14 deletions core-driver/main.cpp
@@ -29,8 +29,11 @@
#include <mpi.h>
#include <stdio.h>

+// Add test-specific options
void construct_opts(int argc, char **argv) {
  auto& arg_parser = lbann::global_argument_parser();
+  lbann::construct_std_options();
+  lbann::construct_datastore_options();
  arg_parser.add_option("samples",
                        {"-n"},
                        "Number of samples to run inference on",
@@ -52,20 +55,76 @@ void construct_opts(int argc, char **argv) {
                        "Number of labels in dataset",
                        10);
  arg_parser.add_option("minibatchsize",
-                       {"-mbs"},
+                       {"--mbs"},
                        "Number of samples in a mini-batch",
                        16);
+  arg_parser.add_flag("use_conduit",
+                      {"--conduit"},
+                      "Use Conduit node samples (Default is non-distributed matrix)");
+  arg_parser.add_flag("use_dist_matrix",
+                      {"--dist"},
+                      "Use Hydrogen distributed matrix (Default is non-distributed matrix)");
  arg_parser.add_required_argument<std::string>
    ("model",
     "Directory containing checkpointed model");
  arg_parser.parse(argc, argv);
}

-El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
-random_samples(El::Grid const& g, int n, int c, int h, int w) {
+// Generates random samples and labels for MNIST data in a Hydrogen matrix
+std::map<
+  std::string,
+  El::Matrix<float, El::Device::CPU>>
+mat_mnist_samples(int n, int c, int h, int w)
+{
+  El::Matrix<float, El::Device::CPU>
+    samples(c * h * w, n);
+  El::MakeUniform(samples);
+  El::Matrix<float, El::Device::CPU>
+    labels(1, n);
+  El::MakeUniform(labels);
+  std::map<
+    std::string,
+    El::Matrix<float, El::Device::CPU>>
+    samples_map = {{"data/samples", samples}, {"data/labels", labels}};
+  return samples_map;
+}
+
+// Generates random samples and labels for MNIST data in a Hydrogen distributed matrix
+std::map<
+  std::string,
+  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
+distmat_mnist_samples(El::Grid const& g, int n, int c, int h, int w)
+{
  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
-    samples(n, c * h * w, g);
+    samples(c * h * w, n, g);
  El::MakeUniform(samples);
+  El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>
+    labels(1, n, g);
+  El::MakeUniform(labels);
+  std::map<
+    std::string,
+    El::DistMatrix<float, El::STAR, El::STAR, El::ELEMENT, El::Device::CPU>>
+    samples_map = {{"data/samples", samples}, {"data/labels", labels}};
+  return samples_map;
+}
+
+// Fills array with random values in [0, 1)
+void random_fill(float *arr, int size, int max_val=255) {
+  for (int i = 0; i < size; i++) {
+    arr[i] = (float)(std::rand() % max_val) / (float)max_val;
+  }
+}
+
+// Generates random samples and labels for MNIST data in a vector of Conduit nodes
+std::vector<conduit::Node> conduit_mnist_samples(int n, int c, int h, int w) {
+  std::vector<conduit::Node> samples(n);
+  int sample_size = c * h * w;
+  std::vector<float> this_sample(sample_size);
+  for (int i = 0; i < n; i++) {
+    random_fill(this_sample.data(), sample_size);
+    samples[i]["data/samples"].set(this_sample.data(), sample_size);
+    samples[i]["data/labels"] = std::rand() % 10;
+  }
+  return samples;
+}

@@ -79,10 +138,13 @@ int main(int argc, char **argv) {
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

-  // Get input arguments and print values
+  // Get input arguments, check them, and print values
  construct_opts(argc, argv);
  auto& arg_parser = lbann::global_argument_parser();
  if (rank == 0) {
+    if (arg_parser.get<bool>("use_conduit") && arg_parser.get<bool>("use_dist_matrix")) {
+      LBANN_ERROR("Cannot use Conduit nodes and a distributed matrix together; choose one of --conduit or --dist");
+    }
    std::stringstream msg;
    msg << "Model: " << arg_parser.get<std::string>("model") << std::endl;
    msg << "{ N, c, h, w } = { " << arg_parser.get<int>("samples") << ", ";
@@ -94,8 +156,8 @@ int main(int argc, char **argv) {
    std::cout << msg.str();
  }

  // Load model and run inference on samples
  auto lbann_comm = lbann::initialize_lbann(MPI_COMM_WORLD);

  auto m = lbann::load_inference_model(lbann_comm.get(),
                                       arg_parser.get<std::string>("model"),
                                       arg_parser.get<int>("minibatchsize"),
@@ -105,14 +167,31 @@ int main(int argc, char **argv) {
                                       arg_parser.get<int>("width")
                                       },
                                       {arg_parser.get<int>("labels")});
-  auto samples = random_samples(lbann_comm->get_trainer_grid(),
-                                arg_parser.get<int>("samples"),
-                                arg_parser.get<int>("channels"),
-                                arg_parser.get<int>("height"),
-                                arg_parser.get<int>("width"));
-  auto labels = lbann::infer(m.get(),
-                             samples,
-                             arg_parser.get<int>("minibatchsize"));

+  // Three options for data generation
+  if (arg_parser.get<bool>("use_conduit")) {
+    auto samples = conduit_mnist_samples(arg_parser.get<int>("samples"),
+                                         arg_parser.get<int>("channels"),
+                                         arg_parser.get<int>("height"),
+                                         arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  } else if (arg_parser.get<bool>("use_dist_matrix")) {
+    auto samples = distmat_mnist_samples(lbann_comm->get_trainer_grid(),
+                                         arg_parser.get<int>("samples"),
+                                         arg_parser.get<int>("channels"),
+                                         arg_parser.get<int>("height"),
+                                         arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  } else {
+    auto samples = mat_mnist_samples(arg_parser.get<int>("samples"),
+                                     arg_parser.get<int>("channels"),
+                                     arg_parser.get<int>("height"),
+                                     arg_parser.get<int>("width"));
+    lbann::set_inference_samples(samples);
+  }
+
+  auto labels = lbann::inference(m.get());

  // Print inference results
  if (lbann_comm->am_world_master()) {
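The Conduit path above is split across several hunks, so here is a condensed, self-contained sketch of how one sample is assembled. It mirrors conduit_mnist_samples and uses only the node layout shown in this diff ("data/samples" and "data/labels"); the make_sample helper is illustrative, not part of the PR.

#include "conduit/conduit.hpp"
#include <cstdlib>
#include <vector>

// Build one MNIST-shaped sample the way conduit_mnist_samples does:
// a flat float array under "data/samples", an integer class label
// under "data/labels".
conduit::Node make_sample(int c, int h, int w)
{
  std::vector<float> pixels(c * h * w);
  for (auto& p : pixels) {
    p = static_cast<float>(std::rand() % 255) / 255.0f; // values in [0, 1)
  }
  conduit::Node sample;
  sample["data/samples"].set(pixels.data(),
                             static_cast<conduit::index_t>(pixels.size()));
  sample["data/labels"] = std::rand() % 10; // one of 10 MNIST classes
  return sample;
}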
10 changes: 10 additions & 0 deletions core-driver/run.sh
@@ -0,0 +1,10 @@
export AL_PROGRESS_RANKS_PER_NUMA_NODE=2
export OMP_NUM_THREADS=8
export MV2_USE_RDMA_CM=0

# This should be a checkpointed lenet model
MODEL_LOC="path/to/checkpointed/model"

./bin/lbann-test-driver $MODEL_LOC
./bin/lbann-test-driver $MODEL_LOC --dist
./bin/lbann-test-driver $MODEL_LOC --conduit
1 change: 1 addition & 0 deletions include/lbann/data_ingestion/readers/CMakeLists.txt
@@ -29,6 +29,7 @@ set_full_path(THIS_DIR_HEADERS
  metadata.hpp
  # Data readers
  data_reader_cifar10.hpp
+  data_reader_conduit.hpp
  data_reader_csv.hpp
  data_reader_image.hpp
  data_reader_HDF5.hpp
72 changes: 72 additions & 0 deletions include/lbann/data_ingestion/readers/data_reader_conduit.hpp
@@ -0,0 +1,72 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014-2021, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
// the CONTRIBUTORS file. <[email protected]>
//
// LLNL-CODE-697807.
// All rights reserved.
//
// This file is part of LBANN: Livermore Big Artificial Neural Network
// Toolkit. For details, see http://software.llnl.gov/LBANN or
// https://github.com/LLNL/LBANN.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
////////////////////////////////////////////////////////////////////////////////

#ifndef LBANN_DATA_READER_CONDUIT_HPP
#define LBANN_DATA_READER_CONDUIT_HPP

#include "lbann/data_readers/data_reader.hpp"
#include "lbann/data_store/data_store_conduit.hpp"

namespace lbann {
/**
 * A generalized data reader for Conduit nodes passed in from an external driver.
 */
class conduit_data_reader : public generic_data_reader
{
public:
conduit_data_reader* copy() const override { return new conduit_data_reader(*this); }
bool has_conduit_output() override { return true; }
void load() override;
bool fetch_conduit_node(conduit::Node& sample, int data_id) override;

void set_data_dims(std::vector<int> dims);
void set_label_dims(std::vector<int> dims);

std::string get_type() const override { return "conduit_data_reader"; }
int get_linearized_data_size() const override {
int data_size = 1;
for(int i : m_data_dims) {
data_size *= i;
}
return data_size;
}
int get_linearized_label_size() const override {
int label_size = 1;
for(int i : m_label_dims) {
label_size *= i;
}
return label_size;
}

private:
std::vector<int> m_data_dims;
std::vector<int> m_label_dims;

}; // END: class conduit_data_reader

} // namespace lbann

#endif // LBANN_DATA_READER_CONDUIT_HPP
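For orientation, a minimal usage sketch of the accessors this header declares, under the MNIST-shaped defaults used by the test driver ({c, h, w} = {1, 28, 28}, 10 labels); the configure_reader wrapper is hypothetical, not part of this PR.

#include "lbann/data_ingestion/readers/data_reader_conduit.hpp"

// Illustrative only: dims follow the driver's MNIST defaults.
void configure_reader(lbann::conduit_data_reader& reader)
{
  reader.set_data_dims({1, 28, 28}); // get_linearized_data_size() -> 784
  reader.set_label_dims({10});       // get_linearized_label_size() -> 10
}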
include/lbann/execution_algorithms/batch_functional_inference_algorithm.hpp
@@ -40,8 +40,7 @@ namespace lbann {
*
* This execution algorithm is meant for running inference using a trained
* model and samples passed by the user from an external application. The
-* algorithm currently assumes that there is only 1 input layer in the model,
-* and the output layer is a softmax layer.
+* algorithm currently assumes that the output layer is a softmax layer.
*/
class batch_functional_inference_algorithm
{
@@ -73,111 +72,16 @@

/** @brief Run model inference on samples and return predicted categories.
 * @param[in] model A trained model
-* @param[in] samples A distributed matrix containing samples for model input
-* @param[in] mbs The max mini-batch size
 * @return Matrix of predicted labels (by index)
 */
-template <typename DataT,
-          El::Dist CDist,
-          El::Dist RDist,
-          El::DistWrap DistView,
-          El::Device Device>
-El::Matrix<int, El::Device::CPU>
-infer(observer_ptr<model> model,
-      El::DistMatrix<DataT, CDist, RDist, DistView, Device> const& samples,
-      size_t mbs)
-{
-  if (mbs <= 0) {
-    LBANN_ERROR("mini-batch size must be larger than 0");
-  }
-
-  // Make matrix for returning predicted labels
-  size_t samples_size = samples.Height();
-  El::Matrix<int, El::Device::CPU> labels(samples_size, 1);
-
-  // BVE FIXME
-  // Create an SGD_execution_context so that layer.forward_prop can get the
-  // mini_batch_size - This should be fixed in the future, when SGD is not so
-  // hard-coded into the model & layers
-  auto c = SGDExecutionContext(execution_mode::inference);
-  model->reset_mode(c, execution_mode::inference);
-  // Explicitly set the size of the mini-batch that the model is executing
-  model->set_current_mini_batch_size(mbs);
-
-  // Infer on mini batches
-  for (size_t i = 0; i < samples_size; i += mbs) {
-    size_t mb_idx = std::min(i + mbs, samples_size);
-    auto mb_range = El::IR(i, mb_idx);
-    auto mb_samples = El::LockedView(samples, mb_range, El::ALL);
-    auto mb_labels = El::View(labels, mb_range, El::ALL);
-
-    infer_mini_batch(*model, mb_samples);
-    get_labels(*model, mb_labels);
-  }
-
-  return labels;
-}
+El::Matrix<El::Int, El::Device::CPU> infer(observer_ptr<model> model);

protected:
-/** @brief Run model inference on a single mini-batch of samples
- * This method takes a mini-batch of samples, inserts them into the input
- * layer of the model, and runs forward prop on the model.
- * @param[in] model A trained model
- * @param[in] samples A distributed matrix containing samples for model input
- */
-template <typename DataT,
-          El::Dist CDist,
-          El::Dist RDist,
-          El::DistWrap DistView,
-          El::Device Device>
-void infer_mini_batch(
-  model& model,
-  El::DistMatrix<DataT, CDist, RDist, DistView, Device> const& samples)
-{
-  for (int i = 0; i < model.get_num_layers(); i++) {
-    auto& l = model.get_layer(i);
-    // Insert samples into the input layer
-    if (l.get_type() == "input") {
-      auto& il = dynamic_cast<input_layer<DataType>&>(l);
-      il.set_samples(samples);
-    }
-  }
-  model.forward_prop(execution_mode::inference);
-}

/** @brief Finds the predicted category in a model's softmax layer
 * @param[in] model A model that has been used for inference
 * @param[in] labels A matrix to place predicted category labels
 */
-void get_labels(model& model, El::Matrix<int, El::Device::CPU>& labels)
-{
-  int pred_label = 0;
-  float max, col_value;
-
-  for (const auto* l : model.get_layers()) {
-    // Find the output layer
-    if (l->get_type() == "softmax") {
-      auto const& dtl =
-        dynamic_cast<lbann::data_type_layer<float> const&>(*l);
-      const auto& outputs = dtl.get_activations();
-
-      // Find the prediction for each sample
-      int col_count = outputs.Width();
-      int row_count = outputs.Height();
-      for (int i = 0; i < col_count; i++) {
-        max = 0;
-        for (int j = 0; j < row_count; j++) {
-          col_value = outputs.Get(i, j);
-          if (col_value > max) {
-            max = col_value;
-            pred_label = j;
-          }
-        }
-        labels(i) = pred_label;
-      }
-    }
-  }
-}
+void get_labels(model& model, El::Matrix<El::Int, El::Device::CPU>& labels);
};

} // namespace lbann
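With get_labels reduced to a declaration, the label-extraction logic that used to live in this header is no longer visible. For orientation, a standalone sketch of the same idea, assuming the usual LBANN layout in which softmax activations hold one class per row and one sample per column; the argmax_labels function and its shapes are illustrative, not LBANN API.

#include <El.hpp>

// Return each sample's predicted class: the row index of the largest
// softmax activation in that sample's column.
El::Matrix<El::Int, El::Device::CPU>
argmax_labels(El::Matrix<float, El::Device::CPU> const& outputs)
{
  El::Int const num_classes = outputs.Height();
  El::Int const num_samples = outputs.Width();
  El::Matrix<El::Int, El::Device::CPU> labels(num_samples, 1);
  for (El::Int col = 0; col < num_samples; ++col) {
    El::Int pred = 0;
    float max_val = outputs.Get(0, col);
    for (El::Int row = 1; row < num_classes; ++row) {
      float const val = outputs.Get(row, col);
      if (val > max_val) { // keep the largest activation seen so far
        max_val = val;
        pred = row;
      }
    }
    labels.Set(col, 0, pred);
  }
  return labels;
}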