Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Update the response queue in the server to reuse response slots #7879

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion qa/L0_decoupled/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -127,6 +127,45 @@ for trial in $TRIALS; do

kill $SERVER_PID
wait $SERVER_PID

SERVER_ARGS="--model-repository=$MODELDIR --grpc-max-response-pool-size=1"
SERVER_LOG="grpc_max_response_pool_size_1_${trial}_server.log"
CLIENT_LOG="grpc_max_response_pool_size_1_${trial}_client.log"
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

for test in \
test_one_to_none \
test_one_to_one \
test_one_to_many \
test_no_streaming \
test_response_order \
test_wrong_shape; do

echo "Test: $test" >>$CLIENT_LOG
set +e
python $DECOUPLED_TEST DecoupledTest.$test >>$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***" >>$CLIENT_LOG
echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***"
RET=1
else
check_test_results $TEST_RESULT_FILE 1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification Failed\n***"
RET=1
fi
fi
set -e
done

kill $SERVER_PID
wait $SERVER_PID
done

# Test the server frontend can merge the responses of non-decoupled model that
Expand Down
16 changes: 15 additions & 1 deletion src/command_line_parser.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -306,6 +306,7 @@ enum TritonOptionId {
OPTION_GRPC_ADDRESS,
OPTION_GRPC_HEADER_FORWARD_PATTERN,
OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
OPTION_GRPC_MAX_RESPONSE_POOL_SIZE,
OPTION_GRPC_USE_SSL,
OPTION_GRPC_USE_SSL_MUTUAL,
OPTION_GRPC_SERVER_CERT,
Expand Down Expand Up @@ -536,6 +537,11 @@ TritonParser::SetupOptions()
"allocated for reuse. As long as the number of in-flight requests "
"doesn't exceed this value there will be no allocation/deallocation of "
"request/response objects."});
grpc_options_.push_back(
{OPTION_GRPC_MAX_RESPONSE_POOL_SIZE, "grpc-max-response-pool-size",
Option::ArgInt,
"The maximum number of inference response objects that can remain "
"allocated in the pool at any given time."});
grpc_options_.push_back(
{OPTION_GRPC_USE_SSL, "grpc-use-ssl", Option::ArgBool,
"Use SSL authentication for GRPC requests. Default is false."});
Expand Down Expand Up @@ -1438,6 +1444,14 @@ TritonParser::Parse(int argc, char** argv)
case OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE:
lgrpc_options.infer_allocation_pool_size_ = ParseOption<int>(optarg);
break;
case OPTION_GRPC_MAX_RESPONSE_POOL_SIZE:
lgrpc_options.max_response_pool_size_ = ParseOption<int>(optarg);
if (lgrpc_options.max_response_pool_size_ <= 0) {
throw ParseException(
"Error: --grpc-max-response-pool-size must be greater "
"than 0.");
}
break;
case OPTION_GRPC_USE_SSL:
lgrpc_options.ssl_.use_ssl_ = ParseOption<bool>(optarg);
break;
Expand Down
12 changes: 7 additions & 5 deletions src/grpc/grpc_server.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -2395,8 +2395,8 @@ Server::Server(
"ModelInferHandler", tritonserver_, trace_manager_, shm_manager_,
&service_, model_infer_cq_.get(),
options.infer_allocation_pool_size_ /* max_state_bucket_count */,
options.infer_compression_level_, restricted_kv,
options.forward_header_pattern_));
options.max_response_pool_size_, options.infer_compression_level_,
restricted_kv, options.forward_header_pattern_));
}

// Handler for streaming inference requests. Keeps one handler for streaming
Expand All @@ -2405,8 +2405,8 @@ Server::Server(
"ModelStreamInferHandler", tritonserver_, trace_manager_, shm_manager_,
&service_, model_stream_infer_cq_.get(),
options.infer_allocation_pool_size_ /* max_state_bucket_count */,
options.infer_compression_level_, restricted_kv,
options.forward_header_pattern_));
options.max_response_pool_size_, options.infer_compression_level_,
restricted_kv, options.forward_header_pattern_));
}

Server::~Server()
Expand Down Expand Up @@ -2472,6 +2472,8 @@ Server::GetOptions(Options& options, UnorderedMapType& options_map)
RETURN_IF_ERR(GetValue(
options_map, "infer_allocation_pool_size",
&options.infer_allocation_pool_size_));
RETURN_IF_ERR(GetValue(
options_map, "max_response_pool_size", &options.max_response_pool_size_));
RETURN_IF_ERR(GetValue(
options_map, "forward_header_pattern", &options.forward_header_pattern_));

Expand Down
3 changes: 2 additions & 1 deletion src/grpc/grpc_server.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -89,6 +89,7 @@ struct Options {
// requests doesn't exceed this value there will be no
// allocation/deallocation of request/response objects.
int infer_allocation_pool_size_{8};
int max_response_pool_size_{INT_MAX};
RestrictedFeatures restricted_protocols_;
std::string forward_header_pattern_;
};
Expand Down
Loading
Loading