-
Notifications
You must be signed in to change notification settings - Fork 96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DRAFT : Register memory #1010
base: master
Are you sure you want to change the base?
DRAFT : Register memory #1010
Changes from 10 commits
c6d49ef
43faffd
0441b48
a22f47c
44d21ea
e14d2b1
8ef02c3
3417737
6589983
4cbdc7e
00a9097
3ca742c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
#include "coll_patterns/sra_knomial.h" | ||
#include "utils/ucc_math.h" | ||
#include "utils/ucc_coll_utils.h" | ||
#include <stdio.h> | ||
|
||
#define SAVE_STATE(_phase) \ | ||
do { \ | ||
|
@@ -50,6 +51,9 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |
args->root : 0; | ||
ucc_rank_t rank = VRANK(task->subset.myrank, broot, size); | ||
size_t local = GET_LOCAL_COUNT(args, size, rank); | ||
ucp_mem_h *mh_list = task->mh_list; | ||
int max_count = task->count_mh; | ||
int count_mh = 0; | ||
void *sbuf; | ||
ptrdiff_t peer_seg_offset, local_seg_offset; | ||
ucc_rank_t peer, peer_dist; | ||
|
@@ -60,37 +64,42 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |
|
||
EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", | ||
task->allgather_kn.etask); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove |
||
task->allgather_kn.etask = NULL; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a BUG - you can't remove it. its needed |
||
UCC_KN_GOTO_PHASE(task->allgather_kn.phase); | ||
if (KN_NODE_EXTRA == node_type) { | ||
peer = ucc_knomial_pattern_get_proxy(p, rank); | ||
if (p->type != KN_PATTERN_ALLGATHERX) { | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb(task->allgather_kn.sbuf, | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb_with_mem(task->allgather_kn.sbuf, | ||
local * dt_size, mem_type, | ||
ucc_ep_map_eval(task->subset.map, | ||
INV_VRANK(peer,broot,size)), | ||
team, task), | ||
team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
|
||
} | ||
UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(rbuf, data_size, mem_type, | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb_with_mem(rbuf, data_size, mem_type, | ||
ucc_ep_map_eval(task->subset.map, | ||
INV_VRANK(peer,broot,size)), | ||
team, task), | ||
team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
} | ||
if ((p->type != KN_PATTERN_ALLGATHERX) && (node_type == KN_NODE_PROXY)) { | ||
peer = ucc_knomial_pattern_get_extra(p, rank); | ||
extra_count = GET_LOCAL_COUNT(args, size, peer); | ||
peer = ucc_ep_map_eval(task->subset.map, peer); | ||
UCPCHECK_GOTO(ucc_tl_ucp_recv_nb(PTR_OFFSET(task->allgather_kn.sbuf, | ||
UCPCHECK_GOTO(ucc_tl_ucp_recv_nb_with_mem(PTR_OFFSET(task->allgather_kn.sbuf, | ||
local * dt_size), extra_count * dt_size, | ||
mem_type, peer, team, task), | ||
mem_type, peer, team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
} | ||
|
||
UCC_KN_PHASE_EXTRA: | ||
if ((KN_NODE_EXTRA == node_type) || (KN_NODE_PROXY == node_type)) { | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test_with_etasks(task)) { | ||
SAVE_STATE(UCC_KN_PHASE_EXTRA); | ||
return; | ||
} | ||
|
@@ -114,12 +123,13 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |
continue; | ||
} | ||
} | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb(sbuf, local_seg_count * dt_size, | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb_with_mem(sbuf, local_seg_count * dt_size, | ||
mem_type, | ||
ucc_ep_map_eval(task->subset.map, | ||
INV_VRANK(peer, broot, size)), | ||
team, task), | ||
team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
} | ||
|
||
for (loop_step = 1; loop_step < radix; loop_step++) { | ||
|
@@ -137,15 +147,16 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |
} | ||
} | ||
UCPCHECK_GOTO( | ||
ucc_tl_ucp_recv_nb(PTR_OFFSET(rbuf, peer_seg_offset * dt_size), | ||
ucc_tl_ucp_recv_nb_with_mem(PTR_OFFSET(rbuf, peer_seg_offset * dt_size), | ||
peer_seg_count * dt_size, mem_type, | ||
ucc_ep_map_eval(task->subset.map, | ||
INV_VRANK(peer, broot, size)), | ||
team, task), | ||
team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
} | ||
UCC_KN_PHASE_LOOP: | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test_recv(task)) { | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test_recv_with_etasks(task)) { | ||
SAVE_STATE(UCC_KN_PHASE_LOOP); | ||
return; | ||
} | ||
|
@@ -154,15 +165,16 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |
|
||
if (KN_NODE_PROXY == node_type) { | ||
peer = ucc_knomial_pattern_get_extra(p, rank); | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb(args->dst.info.buffer, data_size, | ||
UCPCHECK_GOTO(ucc_tl_ucp_send_nb_with_mem(args->dst.info.buffer, data_size, | ||
mem_type, | ||
ucc_ep_map_eval(task->subset.map, | ||
INV_VRANK(peer, broot, size)), | ||
team, task), | ||
team, task, mh_list[count_mh++]), | ||
task, out); | ||
ucc_assert(count_mh >= max_count); | ||
} | ||
UCC_KN_PHASE_PROXY: | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test_with_etasks(task)) { | ||
SAVE_STATE(UCC_KN_PHASE_PROXY); | ||
return; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add in out |
||
|
@@ -193,7 +205,6 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task) | |
|
||
UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_kn_start", 0); | ||
ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); | ||
task->allgather_kn.etask = NULL; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BUG - why removed? |
||
task->allgather_kn.phase = UCC_KN_PHASE_INIT; | ||
if (ct == UCC_COLL_TYPE_ALLGATHER) { | ||
ucc_kn_ag_pattern_init(size, rank, radix, args->dst.info.count, | ||
|
@@ -234,6 +245,123 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task) | |
return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); | ||
} | ||
|
||
void register_memory(ucc_coll_task_t *coll_task){ | ||
|
||
ucc_tl_ucp_task_t *task = ucc_derived_of(coll_task, | ||
ucc_tl_ucp_task_t); | ||
ucc_coll_args_t *args = &TASK_ARGS(task); | ||
ucc_tl_ucp_team_t *team = TASK_TEAM(task); | ||
ucc_kn_radix_t radix = task->allgather_kn.p.radix; | ||
uint8_t node_type = task->allgather_kn.p.node_type; | ||
ucc_knomial_pattern_t *p = &task->allgather_kn.p; | ||
void *rbuf = args->dst.info.buffer; | ||
ucc_memory_type_t mem_type = args->dst.info.mem_type; | ||
size_t count = args->dst.info.count; | ||
size_t dt_size = ucc_dt_size(args->dst.info.datatype); | ||
size_t data_size = count * dt_size; | ||
ucc_rank_t size = task->subset.map.ep_num; | ||
ucc_rank_t broot = args->coll_type == UCC_COLL_TYPE_BCAST ? | ||
args->root : 0; | ||
ucc_rank_t rank = VRANK(task->subset.myrank, broot, size); | ||
size_t local = GET_LOCAL_COUNT(args, size, rank); | ||
void *sbuf; | ||
ptrdiff_t peer_seg_offset, local_seg_offset; | ||
ucc_rank_t peer, peer_dist; | ||
ucc_kn_radix_t loop_step; | ||
size_t peer_seg_count, local_seg_count; | ||
ucc_status_t status; | ||
size_t extra_count; | ||
|
||
ucc_tl_ucp_context_t *ctx = UCC_TL_UCP_TEAM_CTX(team); | ||
ucp_mem_map_params_t mmap_params; | ||
ucp_mem_h mh; | ||
int size_of_list = 1; | ||
int count_mh = 0; | ||
ucp_mem_h *mh_list = (ucp_mem_h *)malloc(size_of_list * sizeof(ucp_mem_h)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. u never called free There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where should this be ? In the end of the register function ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in finalize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also where ? |
||
|
||
mmap_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | | ||
UCP_MEM_MAP_PARAM_FIELD_LENGTH | | ||
UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; | ||
mmap_params.memory_type = ucc_memtype_to_ucs[mem_type]; | ||
|
||
if (KN_NODE_EXTRA == node_type) { | ||
if (p->type != KN_PATTERN_ALLGATHERX) { | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need for all of the line spaces |
||
mmap_params.address = task->allgather_kn.sbuf; | ||
mmap_params.length = local * dt_size; | ||
MEM_MAP(); | ||
} | ||
|
||
mmap_params.address = rbuf; | ||
mmap_params.length = data_size; | ||
MEM_MAP(); | ||
} | ||
if ((p->type != KN_PATTERN_ALLGATHERX) && (node_type == KN_NODE_PROXY)) { | ||
peer = ucc_knomial_pattern_get_extra(p, rank); | ||
extra_count = GET_LOCAL_COUNT(args, size, peer); | ||
peer = ucc_ep_map_eval(task->subset.map, peer); | ||
mmap_params.address = PTR_OFFSET(task->allgather_kn.sbuf, | ||
local * dt_size); | ||
mmap_params.length = extra_count * dt_size; | ||
MEM_MAP(); | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
if (KN_NODE_EXTRA == node_type) { | ||
goto out; | ||
} | ||
while (!ucc_knomial_pattern_loop_done(p)) { | ||
ucc_kn_ag_pattern_peer_seg(rank, p, &local_seg_count, | ||
&local_seg_offset); | ||
sbuf = PTR_OFFSET(rbuf, local_seg_offset * dt_size); | ||
|
||
for (loop_step = radix - 1; loop_step > 0; loop_step--) { | ||
peer = ucc_knomial_pattern_get_loop_peer(p, rank, loop_step); | ||
if (peer == UCC_KN_PEER_NULL) | ||
continue; | ||
if (coll_task->bargs.args.coll_type == UCC_COLL_TYPE_BCAST) { | ||
peer_dist = ucc_knomial_calc_recv_dist(size - p->n_extra, | ||
ucc_knomial_pattern_loop_rank(p, peer), p->radix, 0); | ||
if (peer_dist < task->allgather_kn.recv_dist) { | ||
continue; | ||
} | ||
} | ||
mmap_params.address = sbuf; | ||
mmap_params.length = local_seg_count * dt_size; | ||
MEM_MAP(); | ||
} | ||
|
||
for (loop_step = 1; loop_step < radix; loop_step++) { | ||
peer = ucc_knomial_pattern_get_loop_peer(p, rank, loop_step); | ||
if (peer == UCC_KN_PEER_NULL) | ||
continue; | ||
ucc_kn_ag_pattern_peer_seg(peer, p, &peer_seg_count, | ||
&peer_seg_offset); | ||
|
||
if (coll_task->bargs.args.coll_type == UCC_COLL_TYPE_BCAST) { | ||
peer_dist = ucc_knomial_calc_recv_dist(size - p->n_extra, | ||
ucc_knomial_pattern_loop_rank(p, peer), p->radix, 0); | ||
if (peer_dist > task->allgather_kn.recv_dist) { | ||
continue; | ||
} | ||
} | ||
mmap_params.address = PTR_OFFSET(rbuf, peer_seg_offset * dt_size); | ||
mmap_params.length = peer_seg_count * dt_size; | ||
MEM_MAP(); | ||
} | ||
ucc_kn_ag_pattern_next_iter(p); | ||
} | ||
|
||
if (KN_NODE_PROXY == node_type) { | ||
mmap_params.address = args->dst.info.buffer; | ||
mmap_params.length = data_size; | ||
MEM_MAP(); | ||
} | ||
|
||
out: | ||
task->mh_list = mh_list; | ||
task->count_mh = count_mh-1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because I'm always increamenting count_mh after using it so the last one is by 1 to high |
||
} | ||
|
||
ucc_status_t ucc_tl_ucp_allgather_knomial_init_r( | ||
ucc_base_coll_args_t *coll_args, ucc_base_team_t *team, | ||
ucc_coll_task_t **task_h, ucc_kn_radix_t radix) | ||
|
@@ -243,12 +371,18 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_init_r( | |
ucc_sbgp_t *sbgp; | ||
|
||
task = ucc_tl_ucp_init_task(coll_args, team); | ||
ucc_mpool_init(&task->allgather_kn.etask_node_mpool, 0, sizeof(node_ucc_ee_executor_task_t), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. u didn't check for status There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what should I do if status<0 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. u never called mpool finalize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didnt find anything called mpool finalize, what are you talking about ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ucc_mpool_cleanup in knomial finalize or similar There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see anything close to finalize inside allgather_knomial.c, is it elsewhere ? |
||
0, UCC_CACHE_LINE_SIZE, 16, UINT_MAX, NULL, | ||
tl_team->super.super.context->ucc_context->thread_mode, "etasks_linked_list_nodes"); | ||
|
||
if (tl_team->cfg.use_reordering && | ||
coll_args->args.coll_type == UCC_COLL_TYPE_ALLREDUCE) { | ||
sbgp = ucc_topo_get_sbgp(tl_team->topo, UCC_SBGP_FULL_HOST_ORDERED); | ||
task->subset.myrank = sbgp->group_rank; | ||
task->subset.map = sbgp->map; | ||
} | ||
register_memory(&task->super); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. u didn't check for status |
||
task->allgather_kn.etask_linked_list_head = NULL; | ||
task->allgather_kn.p.radix = radix; | ||
task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; | ||
task->super.post = ucc_tl_ucp_allgather_knomial_start; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BUG - this is a mistake, because u want to keep it when u go in and out from progress
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
right but so how do I initialize it ?