Skip to content

Commit

Permalink
Init micro benchmark of general CPU bandwidth.
Browse files Browse the repository at this point in the history
  • Loading branch information
hongtaozhang committed Nov 12, 2024
2 parents 2778e37 + f67ec84 commit 7c3dcd7
Show file tree
Hide file tree
Showing 3 changed files with 457 additions and 16 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Build script for the cpu_copy benchmark executable.
# Picks a CUDA toolchain when the CUDA toolkit is found, otherwise falls back to ROCm/HIP,
# and aborts configuration when neither is available.
cmake_minimum_required(VERSION 3.18)

project(cpu_copy LANGUAGES CXX)

# QUIET: a missing CUDA toolkit is not an error here, the ROCm branch below handles it.
find_package(CUDAToolkit QUIET)

# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})

# NOTE(review): NVCC_ARCHS_SUPPORTED is not defined in this file; presumably it is set by
# ../cuda_common.cmake - confirm.
include(../cuda_common.cmake)
add_executable(cpu_copy cpu_copy.cu)
set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
# libnuma provides the numa_* calls used by the benchmark source.
target_link_libraries(cpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})

# Convert cuda code to hip code in cpp.
# This runs at configure time and writes cpu_copy.cpp into the source directory.
# NOTE(review): the hipify-perl exit status is not captured, so a failed conversion is only
# noticed later when cpu_copy.cpp is missing or stale.
execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)

# Build the executable from the hipified source (the HIP device library is linked below).
add_executable(cpu_copy cpu_copy.cpp)

# Define HIP_UNCACHED_MEMORY for the target only when the HIP runtime header declares
# hipDeviceMallocUncached.
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()

# Append -O2 to the C++ compiler flags for the ROCm build.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(cpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()

# Install the benchmark binary into <prefix>/bin.
install(TARGETS cpu_copy RUNTIME DESTINATION bin)
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
#include <chrono>
#include <cstring> // for memcpy
#include <getopt.h>
#include <iomanip> // for setting precision
#include <iostream>
#include <numa.h>
#include <numeric>
#include <vector>

/**
 * @brief Command-line configuration of the copy benchmark.
 *
 * Every field starts out zero/false until it is overwritten by option parsing.
 */
struct Opts {
    /// Size of the data buffer used by the copy benchmark.
    uint64_t size{0};

    /// How many warm up rounds are run.
    uint64_t num_warm_up{0};

    /// How many loops are run.
    uint64_t num_loops{0};

    /// When true, the data is checked after the copy.
    bool check_data{false};
};

/**
 * @brief Print the usage instructions for this program.
 *
 * Writes a single-line synopsis of the accepted command-line options to standard output:
 * the three required value options followed by the optional --check_data flag.
 */
void PrintUsage() {
    // The binary built from this file is cpu_copy, so the synopsis must use that name.
    std::cout << "Usage: cpu_copy "
              << "--size <size> "
              << "--num_warm_up <num_warm_up> "
              << "--num_loops <num_loops> "
              << "[--check_data]" << std::endl;
}

/**
 * @brief Checks whether a NUMA node reports any memory.
 *
 * Asks libnuma for the memory size of the given node and treats a positive size as
 * "this node has memory". Nodes without memory (or nodes for which the query fails)
 * cannot be used as a source or destination of the copy benchmark.
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if the node reports a memory size greater than zero, false otherwise
 *         (including when the size query fails).
 */
bool HasMemForNumaNode(int node) {
    // numa_node_size64() is a C function: it signals failure through a negative return
    // value instead of throwing, so the result is checked directly. Passing nullptr as
    // the second argument skips the optional "free memory" output.
    const long long node_size = numa_node_size64(node, nullptr);
    return node_size > 0;
}

/**
 * @brief Tells whether any CPU belongs to the given NUMA node.
 *
 * Fetches the CPU mask of the node from libnuma and counts the bits set in it.
 * A node whose mask is empty is a memory-only node.
 *
 * @param node The identifier of the NUMA node to check.
 * @return true if at least one CPU is assigned to the node; false if the node has no
 *         CPUs or if the CPU mask could not be retrieved.
 */
bool HasCPUsForNumaNode(int node) {
    struct bitmask *cpu_mask = numa_allocate_cpumask();
    bool node_has_cpus = false;

    if (numa_node_to_cpus(node, cpu_mask) == 0) {
        // An empty mask means a memory-only NUMA node.
        node_has_cpus = numa_bitmask_weight(cpu_mask) > 0;
    } else {
        // Lookup failed: report errno and fall through with node_has_cpus == false.
        fprintf(stderr, "HasCPUsForNumaNode::numa_node_to_cpus error on node: %d, code: %d, message: %s\n", node, errno,
                strerror(errno));
    }

    // Single release point for the mask on both the success and the error path.
    numa_bitmask_free(cpu_mask);
    return node_has_cpus;
}

/**
 * @brief Parses command-line options for the CPU copy performance benchmark.
 *
 * Recognized long options are --size, --num_warm_up, --num_loops (each taking an unsigned
 * integer argument and all three mandatory) and the optional flag --check_data.
 * On any parse error, or when a mandatory option is missing, the usage text is printed.
 *
 * @param argc The number of command-line arguments.
 * @param argv The array of command-line arguments.
 * @param opts Output structure that receives the parsed values; fields belonging to options
 *             that were not given on the command line are left untouched.
 * @return 0 on success, -1 on failure.
 */
int ParseOpts(int argc, char **argv, Opts *opts) {
    enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };

    // getopt_long() walks this table until it finds an all-zero entry, so the terminator
    // is mandatory; without it an unknown option makes it read past the end of the array.
    const struct option options[] = {
        {"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
        {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
        {"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
        {"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)},
        {nullptr, 0, nullptr, 0}};

    // Scans an unsigned integer into a 64-bit value. The intermediate unsigned long long
    // matches the "%llu" conversion on every platform, unlike scanning "%lu" straight into
    // a uint64_t.
    const auto parse_u64 = [](const char *text, uint64_t *value) {
        unsigned long long parsed = 0;
        if (1 != sscanf(text, "%llu", &parsed)) {
            return false;
        }
        *value = parsed;
        return true;
    };

    bool size_specified = false;
    bool num_warm_up_specified = false;
    bool num_loops_specified = false;
    bool parse_err = false;

    while (!parse_err) {
        const int getopt_ret = getopt_long(argc, argv, "", options, nullptr);
        if (getopt_ret == -1) {
            // End of options: all three value options are mandatory.
            parse_err = !size_specified || !num_warm_up_specified || !num_loops_specified;
            break;
        }

        // Every table entry has a null flag pointer, so getopt_long() returns the entry's
        // val field; dispatch on that return value directly.
        switch (getopt_ret) {
        case static_cast<int>(OptIdx::kSize):
            if (parse_u64(optarg, &(opts->size))) {
                size_specified = true;
            } else {
                std::cerr << "Invalid size: " << optarg << std::endl;
                parse_err = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumWarmUp):
            if (parse_u64(optarg, &(opts->num_warm_up))) {
                num_warm_up_specified = true;
            } else {
                std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
                parse_err = true;
            }
            break;
        case static_cast<int>(OptIdx::kNumLoops):
            if (parse_u64(optarg, &(opts->num_loops))) {
                num_loops_specified = true;
            } else {
                std::cerr << "Invalid num_loops: " << optarg << std::endl;
                parse_err = true;
            }
            break;
        case static_cast<int>(OptIdx::kEnableCheckData):
            opts->check_data = true;
            break;
        default:
            // '?' (unknown option or missing argument) and anything unexpected.
            parse_err = true;
        }
    }

    if (parse_err) {
        PrintUsage();
        return -1;
    }

    return 0;
}

/**
 * @brief Times a single memcpy between buffers allocated on two NUMA nodes.
 *
 * The calling thread is bound to the source node when that node has CPUs, otherwise to the
 * destination node. A buffer of opts.size bytes is allocated on each node, the source buffer
 * is filled, and one memcpy from source to destination is timed. Both buffers are released
 * before returning.
 *
 * @param src_node The source NUMA node from which memory will be copied.
 * @param dst_node The destination NUMA node to which memory will be copied.
 * @param opts Benchmark options; size and check_data are used here.
 * @return The elapsed time of the memcpy in nanoseconds; 0 if binding the thread or
 *         allocating a buffer failed; -1 if opts.check_data is set and the destination
 *         does not match the source after the copy.
 */
double BenchmarkNUMACopy(int src_node, int dst_node, Opts &opts) {
    // Set CPU affinity to the NUMA node with CPU cores associated
    const int affinity_node = HasCPUsForNumaNode(src_node) ? src_node : dst_node;
    if (numa_run_on_node(affinity_node) != 0) {
        std::cerr << "Failed to set CPU affinity to NUMA node " << affinity_node << std::endl;
        return 0;
    }

    // Allocate memory on the source and destination NUMA nodes
    char *src = static_cast<char *>(numa_alloc_onnode(opts.size, src_node));
    if (!src) {
        std::cerr << "Memory allocation failed on node " << src_node << std::endl;
        return 0;
    }

    char *dst = static_cast<char *>(numa_alloc_onnode(opts.size, dst_node));
    if (!dst) {
        std::cerr << "Memory allocation failed on node " << dst_node << std::endl;
        // Do not leak the source buffer when only the destination allocation fails.
        numa_free(src, opts.size);
        return 0;
    }

    // Initialize the source memory with some data
    memset(src, 1, opts.size);

    // NOTE(review): dst is not written before the timed region, so the measurement may
    // include the cost of first touching the destination pages - confirm this is intended.
    const auto start = std::chrono::high_resolution_clock::now();
    memcpy(dst, src, opts.size);
    const auto end = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double, std::nano> elapsed_ns = end - start;

    // The comparison has to happen while both buffers are still mapped, i.e. before
    // numa_free() releases them.
    const bool data_ok = !opts.check_data || memcmp(src, dst, opts.size) == 0;

    // Free the allocated memory
    numa_free(src, opts.size);
    numa_free(dst, opts.size);

    if (!data_ok) {
        std::cerr << "Data integrity check failed! src_node: " << src_node << ", dst_node: " << dst_node << std::endl;
        return -1;
    }

    return elapsed_ns.count();
}

/**
 * @brief Runs the warm up rounds and the measured loops for one pair of NUMA nodes.
 *
 * Executes opts.num_warm_up untimed copies followed by opts.num_loops measured copies from
 * src_node to dst_node, and keeps the slowest measured copy.
 *
 * @param src_node The source NUMA node from which data will be copied.
 * @param dst_node The destination NUMA node to which data will be copied.
 * @param opts A reference to an Opts object containing the benchmark configuration.
 * @return The largest per-copy time in nanoseconds seen across the measured loops.
 *         Copies that report 0 or a negative value never raise the maximum, so the result
 *         is 0 when no loop produced a positive time (including opts.num_loops == 0).
 */
double RunCPUCopyBenchmark(int src_node, int dst_node, Opts &opts) {
    // The loop bounds are 64-bit unsigned, so the counters use the same type to avoid a
    // signed/unsigned comparison and overflow of an int counter.

    // Run warm up rounds; their timings are intentionally discarded.
    for (uint64_t round = 0; round < opts.num_warm_up; ++round) {
        BenchmarkNUMACopy(src_node, dst_node, opts);
    }

    double max_time_ns = 0;
    for (uint64_t loop = 0; loop < opts.num_loops; ++loop) {
        const double time_used_ns = BenchmarkNUMACopy(src_node, dst_node, opts);
        if (time_used_ns > max_time_ns) {
            max_time_ns = time_used_ns;
        }
    }

    return max_time_ns;
}

int main(int argc, char **argv) {
Opts opts;
int ret = -1;
ret = ParseOpts(argc, argv, &opts);
if (0 != ret) {
return ret;
}

// Check if the system has multiple NUMA nodes
if (-1 == numa_available()) {
std::cerr << "NUMA is not available on this system!" << std::endl;
return 1;
}

int num_of_numa_nodes = numa_num_configured_nodes();

if (num_of_numa_nodes < 2) {
std::cerr << "System has less than 2 NUMA nodes. Benchmark is not applicable." << std::endl;
return 1;
}

// Run the benchmark
for (int src_node = 0; src_node < num_of_numa_nodes; src_node++) {
if (!HasMemForNumaNode(src_node)) {
// Skip the NUMA node if there are no CPUs available
continue;
}

for (int dst_node = 0; dst_node < num_of_numa_nodes; dst_node++) {
if (src_node == dst_node) {
// Skip the same NUMA node
continue;
}

if (!HasMemForNumaNode(dst_node)) {
// Skip the NUMA node if there are no CPUs available
continue;
}

//
if (!HasCPUsForNumaNode(src_node) && !HasCPUsForNumaNode(dst_node)) {
// Skip the process if there are no CPUs available on both NUMA nodes
continue;
}

double time_used_ns = RunCPUCopyBenchmark(src_node, dst_node, opts);
double bw = opts.size / (time_used_ns / 1e9) / 1e6; // MB/s
double latency = time_used_ns / opts.size; // ns/byte

// Output the result
std::cout << "cpu_copy_bw/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9) << bw
<< std::endl;
std::cout << "cpu_copy_latency/node" << src_node << "_to_node" << dst_node << ": " << std::setprecision(9)
<< latency << std::endl;
}
}

return 0;
}
Loading

0 comments on commit 7c3dcd7

Please sign in to comment.