
Add default pinned pool that falls back to new pinned allocations (#15665)

Issue #15612

Adds a pooled pinned memory resource that is created on the first call to `get_host_memory_resource` or `set_host_memory_resource`.
The pool has a fixed size: 0.5% of the device memory capacity, capped at 100MB. At 100MB, the pool takes ~30ms to initialize. The pool size can be overridden with the environment variable `LIBCUDF_PINNED_POOL_SIZE`.
If an allocation cannot be satisfied from the pool, a new pinned allocation is performed instead.
The allocator uses a stream from the global stream pool to initialize the pool and to perform the synchronous operations (`allocate`/`deallocate`). Users of the resource don't need to be aware of this implementation detail, as these operations synchronize on that stream before returning.
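
A minimal sketch of the environment-variable override, assuming a POSIX environment for `setenv` (illustration only, not part of this change). The variable is read once, when the pool is lazily created, so it must be set before the first call that touches the host memory resource:

```cpp
#include <cstdlib>

int main()
{
  // Value is parsed as a byte count (std::atol); 268435456 bytes = 256 MiB.
  // Must be set before the first get/set_host_memory_resource call.
  setenv("LIBCUDF_PINNED_POOL_SIZE", "268435456", /*overwrite=*/1);
  // ... cudf::io calls from here on use a 256 MiB pinned pool ...
  return 0;
}
```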

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Alessandro Bellina (https://github.com/abellina)
  - Jake Hemstad (https://github.com/jrhemstad)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15665
vuule authored May 20, 2024
1 parent 16e8625 commit 58f4526
Showing 3 changed files with 202 additions and 15 deletions.
7 changes: 6 additions & 1 deletion cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -81,6 +81,11 @@ class cuda_stream_pool {
*/
cuda_stream_pool* create_global_cuda_stream_pool();

/**
* @brief Get the global stream pool.
*/
cuda_stream_pool& global_cuda_stream_pool();

/**
* @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another
* stream.
19 changes: 19 additions & 0 deletions cpp/include/cudf/io/memory_resource.hpp
@@ -18,6 +18,8 @@

#include <rmm/resource_ref.hpp>

#include <optional>

namespace cudf::io {

/**
@@ -41,4 +43,21 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r
*/
rmm::host_async_resource_ref get_host_memory_resource();

/**
* @brief Options to configure the default host memory resource
*/
struct host_mr_options {
std::optional<size_t> pool_size; ///< The size of the pool to use for the default host memory
///< resource. If not set, the default pool size is used.
};

/**
* @brief Configure the size of the default host memory resource.
*
* @throws cudf::logic_error if called after the default host memory resource has been created
*
* @param opts Options to configure the default host memory resource
*/
void config_default_host_memory_resource(host_mr_options const& opts);

} // namespace cudf::io
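
A hedged usage sketch of the new configuration hook declared above (the surrounding function is hypothetical; only the `cudf::io` names come from this diff):

```cpp
#include <cudf/io/memory_resource.hpp>

#include <cstddef>

void configure_pinned_pool()
{
  // Configure the default pool size before the default resource is created;
  // once it exists, config_default_host_memory_resource throws
  // cudf::logic_error, per the @throws contract above.
  cudf::io::host_mr_options opts;
  opts.pool_size = std::size_t{256} * 1024 * 1024;  // 256 MiB
  cudf::io::config_default_host_memory_resource(opts);

  // First access creates the pooled pinned resource with the configured size.
  auto mr = cudf::io::get_host_memory_resource();
}
```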
191 changes: 177 additions & 14 deletions cpp/src/io/utilities/config_utils.cpp
@@ -16,10 +16,13 @@

#include "config_utils.hpp"

#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/io/memory_resource.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/export.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

@@ -87,38 +90,198 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_

} // namespace nvcomp_integration

inline std::mutex& host_mr_lock()
} // namespace detail

namespace {
class fixed_pinned_pool_memory_resource {
using upstream_mr = rmm::mr::pinned_host_memory_resource;
using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;

private:
upstream_mr upstream_mr_{};
size_t pool_size_{0};
// Raw pointer to avoid a segfault when the pool is destroyed on exit
host_pooled_mr* pool_{nullptr};
void* pool_begin_{nullptr};
void* pool_end_{nullptr};
cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()};

public:
fixed_pinned_pool_memory_resource(size_t size)
: pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
{
if (pool_size_ == 0) { return; }

// Allocate full size from the pinned pool to figure out the beginning and end address
pool_begin_ = pool_->allocate_async(pool_size_, stream_);
pool_end_ = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_);
pool_->deallocate_async(pool_begin_, pool_size_, stream_);
}

void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
{
if (bytes <= pool_size_) {
try {
return pool_->allocate_async(bytes, alignment, stream);
} catch (...) {
// If the pool is exhausted, fall back to the upstream memory resource
}
}

return upstream_mr_.allocate_async(bytes, alignment, stream);
}

void do_deallocate_async(void* ptr,
std::size_t bytes,
std::size_t alignment,
cuda::stream_ref stream) noexcept
{
if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) {
pool_->deallocate_async(ptr, bytes, alignment, stream);
} else {
upstream_mr_.deallocate_async(ptr, bytes, alignment, stream);
}
}

void* allocate_async(std::size_t bytes, cuda::stream_ref stream)
{
return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
}

void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
{
return do_allocate_async(bytes, alignment, stream);
}

void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
{
auto const result = do_allocate_async(bytes, alignment, stream_);
stream_.wait();
return result;
}

void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
{
return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
}

void deallocate_async(void* ptr,
std::size_t bytes,
std::size_t alignment,
cuda::stream_ref stream) noexcept
{
return do_deallocate_async(ptr, bytes, alignment, stream);
}

void deallocate(void* ptr,
std::size_t bytes,
std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
{
deallocate_async(ptr, bytes, alignment, stream_);
stream_.wait();
}

bool operator==(fixed_pinned_pool_memory_resource const& other) const
{
return pool_ == other.pool_ and stream_ == other.stream_;
}

bool operator!=(fixed_pinned_pool_memory_resource const& other) const
{
return !operator==(other);
}

[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
cuda::mr::device_accessible) noexcept
{
}

[[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
cuda::mr::host_accessible) noexcept
{
}
};

static_assert(cuda::mr::resource_with<fixed_pinned_pool_memory_resource,
cuda::mr::device_accessible,
cuda::mr::host_accessible>,
"");

} // namespace
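
For intuition, a hypothetical standalone use of the class above (within libcudf it is only constructed lazily, in `make_default_pinned_mr` below). Allocations that fit the pool are served from the pooled region; larger requests, and pool-exhaustion failures, fall through to a fresh pinned allocation:

```cpp
void fallback_illustration()
{
  fixed_pinned_pool_memory_resource mr{4u << 20};  // 4 MiB pool
  void* a = mr.allocate(1u << 20);   // fits: served from the pooled region
  void* b = mr.allocate(16u << 20);  // exceeds pool size: new pinned allocation
  mr.deallocate(a, 1u << 20);        // within [pool_begin_, pool_end_]: pooled path
  mr.deallocate(b, 16u << 20);       // outside the pool range: upstream path
}
```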

CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional<size_t> config_size)
{
static fixed_pinned_pool_memory_resource mr = [config_size]() {
auto const size = [&config_size]() -> size_t {
if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) {
return std::atol(env_val);
}

if (config_size.has_value()) { return *config_size; }

size_t free{}, total{};
CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total));
// 0.5% of the total device memory, capped at 100MB
return std::min(total / 200, size_t{100} * 1024 * 1024);
}();

// rmm requires the pool size to be a multiple of 256 bytes
auto const aligned_size = (size + 255) & ~255;
CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);

// make the pool with max size equal to the initial size
return fixed_pinned_pool_memory_resource{aligned_size};
}();

static rmm::host_async_resource_ref mr_ref{mr};
return mr_ref;
}
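
To make the sizing logic concrete, a worked example for a hypothetical 16 GiB device (all values in bytes): `total / 200` is 0.5% of device memory, and here the result stays under the 100 MB cap:

```cpp
#include <algorithm>
#include <cstddef>

void sizing_example()
{
  std::size_t const total = 16ull * 1024 * 1024 * 1024;  // 17'179'869'184
  std::size_t const size =
    std::min(total / 200, std::size_t{100} * 1024 * 1024);  // 85'899'345, under the cap
  // Round up to the next multiple of 256, as rmm requires.
  std::size_t const aligned = (size + 255) & ~255;  // 85'899'520
}
```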

CUDF_EXPORT std::mutex& host_mr_mutex()
{
static std::mutex map_lock;
return map_lock;
}

inline rmm::host_async_resource_ref default_pinned_mr()
// Must be called with the host_mr_mutex mutex held
CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional<host_mr_options> const& opts)
{
static rmm::mr::pinned_host_memory_resource default_mr{};
return default_mr;
static rmm::host_async_resource_ref* mr_ref = nullptr;
if (mr_ref == nullptr) {
mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
} else {
// Throw an error if the user tries to reconfigure the default host resource
CUDF_EXPECTS(opts == std::nullopt, "The default host memory resource has already been created");
}

return *mr_ref;
}

CUDF_EXPORT inline auto& host_mr()
// Must be called with the host_mr_mutex mutex held
CUDF_EXPORT rmm::host_async_resource_ref& host_mr()
{
static rmm::host_async_resource_ref host_mr = default_pinned_mr();
return host_mr;
static rmm::host_async_resource_ref mr_ref = make_host_mr(std::nullopt);
return mr_ref;
}

} // namespace detail

rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr)
{
std::lock_guard lock{detail::host_mr_lock()};
auto last_mr = detail::host_mr();
detail::host_mr() = mr;
std::scoped_lock lock{host_mr_mutex()};
auto last_mr = host_mr();
host_mr() = mr;
return last_mr;
}

rmm::host_async_resource_ref get_host_memory_resource()
{
std::lock_guard lock{detail::host_mr_lock()};
return detail::host_mr();
std::scoped_lock lock{host_mr_mutex()};
return host_mr();
}

void config_default_host_memory_resource(host_mr_options const& opts)
{
std::scoped_lock lock{host_mr_mutex()};
make_host_mr(opts);
}

} // namespace cudf::io
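
Finally, a sketch of the public set/get round trip defined above. `set_host_memory_resource` returns the previously installed resource, so a custom resource can be swapped in and later restored; both calls serialize on `host_mr_mutex` (the helper function and `my_mr` are hypothetical):

```cpp
#include <cudf/io/memory_resource.hpp>

#include <rmm/mr/pinned_host_memory_resource.hpp>

void swap_host_mr_example()
{
  // Any resource convertible to rmm::host_async_resource_ref works;
  // it must outlive its time as the active resource.
  static rmm::mr::pinned_host_memory_resource my_mr;

  auto const previous = cudf::io::set_host_memory_resource(my_mr);
  // ... cudf::io host allocations now use my_mr ...
  cudf::io::set_host_memory_resource(previous);  // restore the default
}
```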
