Skip to content

Commit

Permalink
Support for fixed number of requests (#633)
Browse files Browse the repository at this point in the history
* first pass. Hardcoded values

* Working for concurrency (hardcoded whenever count windows is used for now)

* working for req rate as well

* Add CLI. Add/fix unit tests

* Remove hack. Restore all normal functionality

* Refactor thread config into one class. Add more testing

* Rename arg to request-count

* Fix request rate bug

* Update info print

* fix corner case

* move fixme to a story tag

* add assert to avoid corner case

* rename variables

* self review #1

* copyright changes

* add doxygen to functions

* Don't allow sweeping over multiple concurrency or request rate with request-count
  • Loading branch information
tgerdesnv authored May 9, 2024
1 parent 6609ecd commit 54ec281
Show file tree
Hide file tree
Showing 24 changed files with 516 additions and 189 deletions.
1 change: 1 addition & 0 deletions c++/perf_analyzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ set(
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
thread_config.h
)

add_executable(
Expand Down
2 changes: 2 additions & 0 deletions c++/perf_analyzer/client_backend/client_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ enum BackendKind {
TRITON_C_API = 3,
OPENAI = 4
};
std::string BackendKindToString(const BackendKind kind);

enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 };
enum GrpcCompressionAlgorithm {
COMPRESS_NONE = 0,
Expand Down
49 changes: 49 additions & 0 deletions c++/perf_analyzer/command_line_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ CLParser::Usage(const std::string& msg)
"profiling>"
<< std::endl;
std::cerr << "\t--percentile <percentile>" << std::endl;
std::cerr << "\t--request-count <number of requests>" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
std::cerr << "\t-c <maximum concurrency>" << std::endl;
Expand Down Expand Up @@ -463,6 +464,14 @@ CLParser::Usage(const std::string& msg)
"that the average latency is used to determine stability",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-count: Specifies a total number of requests to "
"use for measurement. The default is 0, which means that there is "
"no request count and the measurement will proceed using windows "
"until stabilization is detected.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --serial-sequences: Enables serial sequence mode "
"where a maximum of one request is outstanding at a time "
Expand Down Expand Up @@ -879,6 +888,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"request-period", required_argument, 0, 59},
{"request-parameter", required_argument, 0, 60},
{"endpoint", required_argument, 0, 61},
{"request-count", required_argument, 0, 62},
{0, 0, 0, 0}};

// Parse commandline...
Expand Down Expand Up @@ -1614,6 +1624,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
params_->endpoint = optarg;
break;
}
case 62: {
if (std::stoi(optarg) < 0) {
Usage("Failed to parse --request-count. The value must be > 0.");
}
params_->request_count = std::stoi(optarg);
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
Expand Down Expand Up @@ -1705,6 +1722,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
// Will be using user-provided time intervals, hence no control variable.
params_->search_mode = SearchMode::NONE;
}

// When the request-count feature is enabled, override the measurement mode to
// be count windows with a window size of the requested count
if (params_->request_count) {
params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
params_->measurement_request_count = params_->request_count;
}
}

void
Expand Down Expand Up @@ -1874,6 +1898,31 @@ CLParser::VerifyOptions()
"binary search mode.");
}

if (params_->request_count != 0) {
if (params_->using_concurrency_range) {
if (params_->request_count < params_->concurrency_range.start) {
Usage("request-count can not be less than concurrency");
}
if (params_->concurrency_range.start < params_->concurrency_range.end) {
Usage(
"request-count not supported with multiple concurrency values in "
"one run");
}
}
if (params_->using_request_rate_range) {
if (params_->request_count <
static_cast<int>(params_->request_rate_range[0])) {
Usage("request-count can not be less than request-rate");
}
if (params_->request_rate_range[SEARCH_RANGE::kSTART] <
params_->request_rate_range[SEARCH_RANGE::kEND]) {
Usage(
"request-count not supported with multiple request-rate values in "
"one run");
}
}
}

if (params_->kind == cb::TENSORFLOW_SERVING) {
if (params_->protocol != cb::ProtocolType::GRPC) {
Usage(
Expand Down
1 change: 1 addition & 0 deletions c++/perf_analyzer/command_line_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct PerfAnalyzerParameters {
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
size_t request_count = 0;
bool zero_input = false;
size_t string_length = 128;
std::string string_data;
Expand Down
22 changes: 15 additions & 7 deletions c++/perf_analyzer/concurrency_manager.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -84,10 +84,10 @@ ConcurrencyManager::InitManagerFinalize()

cb::Error
ConcurrencyManager::ChangeConcurrencyLevel(
const size_t concurrent_request_count)
const size_t concurrent_request_count, const size_t request_count)
{
PauseSequenceWorkers();
ReconfigThreads(concurrent_request_count);
ReconfigThreads(concurrent_request_count, request_count);
ResumeSequenceWorkers();

std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
Expand All @@ -109,7 +109,8 @@ ConcurrencyManager::PauseSequenceWorkers()
}

void
ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
ConcurrencyManager::ReconfigThreads(
size_t concurrent_request_count, size_t request_count)
{
// Always prefer to create new threads if the maximum limit has not been met
//
Expand All @@ -121,8 +122,7 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
(threads_.size() < max_threads_)) {
// Launch new thread for inferencing
threads_stat_.emplace_back(new ThreadStat());
threads_config_.emplace_back(
new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
threads_config_.emplace_back(new ThreadConfig(threads_config_.size()));

workers_.push_back(
MakeWorker(threads_stat_.back(), threads_config_.back()));
Expand All @@ -138,13 +138,21 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
// and spread the remaining value
size_t avg_concurrency = concurrent_request_count / threads_.size();
size_t threads_add_one = concurrent_request_count % threads_.size();

size_t avg_req_count = request_count / threads_.size();
size_t req_count_add_one = request_count % threads_.size();

size_t seq_stat_index_offset = 0;
active_threads_ = 0;
for (size_t i = 0; i < threads_stat_.size(); i++) {
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);

threads_config_[i]->concurrency_ = concurrency;
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;

size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0);
threads_config_[i]->num_requests_ = thread_num_reqs;

seq_stat_index_offset += concurrency;

if (concurrency) {
Expand All @@ -171,7 +179,7 @@ ConcurrencyManager::ResumeSequenceWorkers()
std::shared_ptr<IWorker>
ConcurrencyManager::MakeWorker(
std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
std::shared_ptr<ThreadConfig> thread_config)
{
uint32_t id = workers_.size();

Expand Down
14 changes: 8 additions & 6 deletions c++/perf_analyzer/concurrency_manager.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -89,14 +89,16 @@ class ConcurrencyManager : public LoadManager {
/// Adjusts the number of concurrent requests to be the same as
/// 'concurrent_request_count' (by creating or pausing threads)
/// \param concurent_request_count The number of concurrent requests.
/// \param request_count The number of requests to generate. If 0, then
/// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
cb::Error ChangeConcurrencyLevel(
const size_t concurrent_request_count, const size_t request_count = 0);

protected:
// Makes a new worker
virtual std::shared_ptr<IWorker> MakeWorker(
std::shared_ptr<ThreadStat>,
std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
std::shared_ptr<ThreadStat>, std::shared_ptr<ThreadConfig>);

ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
Expand All @@ -114,7 +116,7 @@ class ConcurrencyManager : public LoadManager {

size_t max_concurrency_;

std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
std::vector<std::shared_ptr<ThreadConfig>> threads_config_;

private:
void InitManagerFinalize() override;
Expand All @@ -126,7 +128,7 @@ class ConcurrencyManager : public LoadManager {
// Create new threads (if necessary), and then reconfigure all worker threads
// to handle the new concurrent request count
//
void ReconfigThreads(size_t concurrent_request_count);
void ReconfigThreads(size_t concurrent_request_count, size_t request_count);

// Restart all worker threads that were working on sequences
//
Expand Down
37 changes: 7 additions & 30 deletions c++/perf_analyzer/concurrency_worker.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -29,6 +29,7 @@

#include "load_worker.h"
#include "sequence_manager.h"
#include "thread_config.h"

namespace triton { namespace perfanalyzer {

Expand All @@ -49,28 +50,6 @@ class NaggyMockConcurrencyWorker;
///
class ConcurrencyWorker : public LoadWorker {
public:
struct ThreadConfig {
ThreadConfig(
size_t thread_id, size_t concurrency = 0,
size_t seq_stat_index_offset = 0)
: thread_id_(thread_id), concurrency_(concurrency),
seq_stat_index_offset_(seq_stat_index_offset), is_paused_(false)
{
}

// ID of corresponding worker thread
size_t thread_id_;

// The concurrency level that the worker should produce
size_t concurrency_;

// The starting sequence stat index for this worker
size_t seq_stat_index_offset_;

// Whether or not the thread is issuing new inference requests
bool is_paused_;
};

ConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
Expand All @@ -85,11 +64,11 @@ class ConcurrencyWorker : public LoadWorker {
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: LoadWorker(
id, thread_stat, parser, data_loader, factory, on_sequence_model,
async, streaming, batch_size, using_json_data, wake_signal,
wake_mutex, execute, infer_data_manager, sequence_manager),
thread_config_(thread_config), max_concurrency_(max_concurrency),
active_threads_(active_threads)
id, thread_stat, thread_config, parser, data_loader, factory,
on_sequence_model, async, streaming, batch_size, using_json_data,
wake_signal, wake_mutex, execute, infer_data_manager,
sequence_manager),
max_concurrency_(max_concurrency), active_threads_(active_threads)
{
}

Expand All @@ -109,8 +88,6 @@ class ConcurrencyWorker : public LoadWorker {
// threads?
size_t& active_threads_;

std::shared_ptr<ThreadConfig> thread_config_;

// Handle the case where execute_ is false
void HandleExecuteOff();

Expand Down
6 changes: 3 additions & 3 deletions c++/perf_analyzer/custom_load_manager.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -76,10 +76,10 @@ CustomLoadManager::CustomLoadManager(
}

cb::Error
CustomLoadManager::InitCustomIntervals()
CustomLoadManager::InitCustomIntervals(const size_t request_count)
{
PauseWorkers();
ConfigureThreads();
ConfigureThreads(request_count);
auto status = GenerateSchedule();
ResumeWorkers();
return status;
Expand Down
6 changes: 4 additions & 2 deletions c++/perf_analyzer/custom_load_manager.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -88,8 +88,10 @@ class CustomLoadManager : public RequestRateManager {

/// Initializes the load manager with the provided file containing request
/// intervals
/// \param request_count The number of requests to generate. If 0, then
/// there is no limit, and it will generate until told to stop.
/// \return cb::Error object indicating success or failure.
cb::Error InitCustomIntervals();
cb::Error InitCustomIntervals(const size_t request_count);

/// Computes the request rate from the time interval file. Fails with an error
/// if the file is not present or is empty.
Expand Down
Loading

0 comments on commit 54ec281

Please sign in to comment.