Skip to content

Commit

Permalink
tuner: replacing model with regions-based tuner
Browse files Browse the repository at this point in the history
This commit replaces the model-based tuner with a regions-based one.
The tuner now uses a set of regions to choose what algorithm and protocol
combination to use. A region is an ordered list of vertices defining a polygon
in the 2D space (Message size X number of ranks). A region covers all the
points where the corresponding algorithm+protocol combination should be chosen,
and every combination that can be chosen has its region.
When NCCL invokes get_coll_info() the tuner scans all the regions and finds the
first one which has the specific (message size, number of ranks) point inside
or on its edge. Therefore, regions can overlap and their order is important.
We use the ray-casting algorithm to find if a point belongs to a region.
Moreover, we extend regions for larger message sizes and higher number of ranks
by extending the external facing sides.
Any point that does not belong to any region will result in falling back to
NCCL's internal tuner.
We also have different sets of regions for different ratios of
(num_ranks/num_nodes).

Signed-off-by: Amedeo Sapio <[email protected]>
  • Loading branch information
AmedeoSapio authored and a-szegel committed Jul 24, 2024
1 parent 2c7eff0 commit bacae0c
Show file tree
Hide file tree
Showing 11 changed files with 721 additions and 845 deletions.
1 change: 0 additions & 1 deletion include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ noinst_HEADERS = \
nccl_ofi_tracepoint.h \
tracing_impl/lttng.h \
tracing_impl/nvtx.h \
internal/tuner/algo/allreduce/ring.h \
internal/tuner/nccl_defaults.h \
nccl-headers/net.h \
nccl-headers/error.h \
Expand Down
272 changes: 0 additions & 272 deletions include/internal/tuner/algo/allreduce/ring.h

This file was deleted.

72 changes: 45 additions & 27 deletions include/nccl_ofi_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,34 +12,52 @@
#include "nccl-headers/nvidia/tuner.h"
#include "nccl_ofi_param.h"

struct nccl_ofi_tuner_model_params {
float net_lat;
float internode_bw;
float intranode_bw;
int num_rails;
uint64_t nccl_buffsize;
};

struct nccl_ofi_tuner_model_dims {
/* Maximum number of vertices per region.
 * This is the capacity of the fixed-size vertices[] array embedded in
 * nccl_ofi_tuner_region_t.
 */
#define TUNER_MAX_NUM_VERTICES 20

/* Maximum number of ranks with which the tuner can deal.
 * Above this value, it will fall back to NCCL's tuner.
 *
 * The value is a double (1024.0 * 1024 == 1048576.0). The replacement list
 * is parenthesized so the macro behaves as a single operand in any
 * expression, e.g. `x / TUNER_MAX_RANKS` or `-TUNER_MAX_RANKS`.
 */
#define TUNER_MAX_RANKS (1024.0 * 1024)

/* Maximum message size with which the tuner can deal.
 * Above this value, it will fall back to NCCL's tuner.
 *
 * The value is a double (100.0 * 1024^3 == 107374182400.0). The replacement
 * list is parenthesized so the macro behaves as a single operand in any
 * expression, e.g. `x / TUNER_MAX_SIZE`.
 */
#define TUNER_MAX_SIZE (100.0 * 1024 * 1024 * 1024)

typedef struct nccl_ofi_tuner_model_dims {
/* communicator size */
int num_ranks;
int num_nodes;
};

struct nccl_ofi_tuner_context {
struct nccl_ofi_tuner_model_dims dims;
struct nccl_ofi_tuner_model_params model_params;
};

/* Modeling functions */
double nccl_ofi_tuner_compute_cost(struct nccl_ofi_tuner_model_dims const *dims,
struct nccl_ofi_tuner_model_params const *params,
ncclFunc_t func,
int algo,
int proto,
int pipe_ops,
size_t nChan,
size_t size);
size_t num_ranks;
size_t num_nodes;
} nccl_ofi_tuner_model_dims_t;

/* A point in the 2D plane used by the regions-based tuner.
 *
 * NOTE(review): the commit description gives the plane as
 * (message size, number of ranks), so x is presumably the message size and
 * y the number of ranks -- confirm against the code that builds points.
 */
typedef struct nccl_ofi_tuner_point {
/* First coordinate (presumably message size; see note above) */
double x;
/* Second coordinate (presumably number of ranks; see note above) */
double y;
} nccl_ofi_tuner_point_t;

/* A tuner region: a polygon, given as an ordered list of vertices, together
 * with the algorithm/protocol combination associated with it.
 */
typedef struct nccl_ofi_tuner_region {
/* Algorithm associated with this region.
 * NOTE(review): presumably one of NCCL's algorithm identifiers -- confirm.
 */
int algorithm;
/* Protocol associated with this region.
 * NOTE(review): presumably one of NCCL's protocol identifiers -- confirm.
 */
int protocol;
/* Number of entries of vertices[] that are in use.
 * NOTE(review): expected to be <= TUNER_MAX_NUM_VERTICES; whether this is
 * validated is not visible here.
 */
size_t num_vertices;
/* Polygon vertices; capacity is fixed at TUNER_MAX_NUM_VERTICES */
nccl_ofi_tuner_point_t vertices[TUNER_MAX_NUM_VERTICES];
} nccl_ofi_tuner_region_t;

/* Tuner state: the communicator dimensions plus the set of regions the
 * tuner consults.
 */
typedef struct nccl_ofi_tuner_context {
/* Communicator dimensions (number of ranks and nodes) */
nccl_ofi_tuner_model_dims_t dims;
/* Number of entries in the regions array */
size_t num_regions;
/* Array of num_regions regions.
 * NOTE(review): allocation and ownership are not visible in this header;
 * presumably populated by set_regions() -- confirm.
 */
nccl_ofi_tuner_region_t *regions;
} nccl_ofi_tuner_context_t;

/* Functions to set and test regions */
/* Test whether `point` belongs to `region`.
 *
 * NOTE(review): the implementation is not visible here. Per the commit
 * description, a point inside the polygon or on its edge counts as belonging
 * to the region; the exact meaning of the int return value (boolean vs.
 * multi-valued) should be confirmed against the definition.
 */
int is_inside_region(nccl_ofi_tuner_point_t point, nccl_ofi_tuner_region_t *region);

/* Install a set of regions into the tuner context.
 *
 * nccl_ofi_tuner_ctx: tuner context to update.
 * num_regions:        number of entries in `regions`.
 * regions:            regions to install (not modified; declared const).
 * regions_size:       NOTE(review): presumably the size in bytes of the
 *                     `regions` array -- confirm against the definition.
 *
 * Returns an ncclResult_t status code.
 * NOTE(review): whether the regions are copied into the context or merely
 * referenced is not visible in this header.
 */
ncclResult_t set_regions(nccl_ofi_tuner_context_t *nccl_ofi_tuner_ctx,
size_t num_regions,
const nccl_ofi_tuner_region_t regions[],
size_t regions_size);

/* Compute a point used to extend a region, from points a, b and z.
 *
 * NOTE(review): the implementation is not visible here. Per the commit
 * description, regions are extended towards larger message sizes and higher
 * rank counts by extending their external-facing sides; presumably a and b
 * define such a side and z bounds the extension -- confirm against the
 * definition.
 */
nccl_ofi_tuner_point_t extend_region(nccl_ofi_tuner_point_t a, nccl_ofi_tuner_point_t b, nccl_ofi_tuner_point_t z);

/* In the original introduction of the external tuner v2 struct, NCCL did not
* enumerate downwards through versions and attempt to load the first valid
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ endif
#
noinst_LTLIBRARIES += libinternal_tuner_plugin.la
tuner_sources = \
tuner/nccl_ofi_model.c \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c
libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
libinternal_tuner_plugin_la_LDFLAGS = -avoid-version
Expand Down
Loading

0 comments on commit bacae0c

Please sign in to comment.