From fb65dde110562aefb4d5906a6a9e5869f469834b Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 21 May 2024 15:20:05 -0500 Subject: [PATCH] Gptj metric scaling (#1699) * Enable inferring tokens per second for gptj * Add inferred token latencies demos * Use sample_count - 1 to infer completed tps * Update mlperf.conf --- loadgen/bindings/python_api.cc | 2 + .../token_metrics/py_demo_offline_inferred.py | 119 ++++++++++++++++++ .../token_metrics/py_demo_server_inferred.py | 116 +++++++++++++++++ loadgen/results.cc | 48 ++++++- loadgen/test_settings.h | 3 + loadgen/test_settings_internal.cc | 13 +- loadgen/test_settings_internal.h | 3 + mlperf.conf | 10 +- 8 files changed, 307 insertions(+), 7 deletions(-) create mode 100644 loadgen/demos/token_metrics/py_demo_offline_inferred.py create mode 100644 loadgen/demos/token_metrics/py_demo_server_inferred.py diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index cfe24bd3c..83e14e0ec 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -338,6 +338,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) + .def_readwrite("infer_token_latencies", &TestSettings::infer_token_latencies) + .def_readwrite("token_latency_scaling_factor", &TestSettings::token_latency_scaling_factor) .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); pybind11::enum_(m, "LoggingMode") diff --git a/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/loadgen/demos/token_metrics/py_demo_offline_inferred.py new file mode 100644 index 000000000..79390e1b8 --- /dev/null +++ b/loadgen/demos/token_metrics/py_demo_offline_inferred.py @@ -0,0 +1,119 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import argparse +import threading +import time +import numpy as np +import array + +import mlperf_loadgen + + +def f(x, y): + return (4 + 3*x*y + x**3 + y**2) + +def create_responses(n, m, mod = 4): + r = [] + for i in range(n): + r.append([f(i,j) for j in range(m + (i%mod))]) + return r +responses = create_responses(1024, 20, mod = 3) + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
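# With infer_token_latencies enabled (see main() below), the SUT never reports
# per-token results: the FirstTokenComplete() call stays commented out and the
# loadgen instead infers tokens per second from the completed sample count and
# the configured token_latency_scaling_factor.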
+def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + query_responses = [] + samples_to_complete = query_samples[i_slice:len(query_samples):3] + for s in samples_to_complete: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) + time.sleep(.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size)) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=(query_samples, 0)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 1)).start() + threading.Thread(target=process_query_async, + args=(query_samples, 2)).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument("--expected-qps", type=int, default=1000) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.offline_expected_qps = args.expected_qps + settings.min_duration_ms = args.min_duration_ms + settings.infer_token_latencies = 1 + settings.token_latency_scaling_factor = 21 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/loadgen/demos/token_metrics/py_demo_server_inferred.py b/loadgen/demos/token_metrics/py_demo_server_inferred.py new file mode 100644 index 000000000..b4431ec9c --- /dev/null +++ b/loadgen/demos/token_metrics/py_demo_server_inferred.py @@ -0,0 +1,116 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
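This server variant exercises inferred token metrics: the SUT only calls
QuerySamplesComplete(), and the loadgen derives completed tokens per second
from the completed sample count and the token_latency_scaling_factor set in
main().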
+""" + +from __future__ import print_function + +import argparse +import array +import threading +import time +import numpy as np + +from absl import app +import mlperf_loadgen + +def f(x, y): + return (4 + 3*x*y + x**3 + y**2) + +def create_responses(n, m, mod = 4): + r = [] + for i in range(n): + r.append([f(i,j) for j in range(m + (i%mod))]) + return r +responses = create_responses(1024, 20, mod = 3) + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + query_responses = [] + for s in query_samples: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + time.sleep(.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + # print(f"Reported size python: {n_tokens}") + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size)) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, + args=[query_samples]).start() + + +def flush_queries(): + pass + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument("--target-qps", type=int, default=100) + parser.add_argument("--target-latency-ns", type=int, default=100000000) + parser.add_argument("--min-query-count", type=int, default=100) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.server_target_qps = args.target_qps + settings.server_target_latency_ns = args.target_latency_ns + settings.min_query_count = args.min_query_count + settings.min_duration_ms = args.min_duration_ms + settings.infer_token_latencies = 1 + settings.token_latency_scaling_factor = 21 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() + diff --git a/loadgen/results.cc b/loadgen/results.cc index 445de8901..00ea66558 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -440,6 +440,32 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { break; } case TestScenario::Server: + double tps_as_completed = + token_count / pr.final_query_all_samples_done_time; + summary("Completed tokens per second: ", + DoubleToString(tps_as_completed)); + break; + } + } + + if (settings.infer_token_latencies){ + switch (settings.scenario) { + case TestScenario::SingleStream: { + break; + } + case 
TestScenario::MultiStream: { + break; + } + case TestScenario::Offline: { + double tokens_per_second = settings.token_latency_scaling_factor * sample_count / pr.max_latency; + summary("Tokens per second (inferred): ", tokens_per_second); + break; + } + case TestScenario::Server: + double tps_as_completed = + settings.token_latency_scaling_factor * (sample_count - 1) / pr.final_query_all_samples_done_time; + summary("Completed tokens per second (inferred): ", + DoubleToString(tps_as_completed)); break; } } @@ -777,8 +803,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second); } } + + if (settings.infer_token_latencies){ + switch (settings.scenario) { + case TestScenario::Server: { + double completed_tokens_per_second = (sample_count - 1) * settings.token_latency_scaling_factor / pr.final_query_all_samples_done_time; + MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second", completed_tokens_per_second); + break; + } + case TestScenario::Offline: { + double tokens_per_second = sample_count * settings.token_latency_scaling_factor / pr.max_latency; + MLPERF_LOG(detail, "result_inferred_tokens_per_second", tokens_per_second); + break; + } + case TestScenario::SingleStream: { + break; + } + case TestScenario::MultiStream: { + break; + } + } #endif } - +} } // namespace loadgen } // namespace mlperf diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index b0018380d..8b209035c 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -267,6 +267,9 @@ struct TestSettings { /// Token latency parameters uint64_t server_ttft_latency = 100000000; uint64_t server_tpot_latency = 100000000; + /// \brief Infer token latencies + bool infer_token_latencies = false; + uint64_t token_latency_scaling_factor; /**@}*/ }; diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 2bcf62c29..5a18c32f9 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -51,7 +51,9 @@ TestSettingsInternal::TestSettingsInternal( sample_concatenate_permutation(false), use_token_latencies(requested.use_token_latencies), server_ttft_latency(requested.server_ttft_latency), - server_tpot_latency(requested.server_tpot_latency){ + server_tpot_latency(requested.server_tpot_latency), + infer_token_latencies(requested.infer_token_latencies), + token_latency_scaling_factor(requested.token_latency_scaling_factor){ // Target QPS, target latency, and max_async_queries. switch (requested.scenario) { case TestScenario::SingleStream: @@ -689,7 +691,7 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, nullptr); lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr); - // keys that apply to token metrics + // keys to measure token metrics if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)){ use_token_latencies = (val == 1) ? true : false; if (use_token_latencies){ @@ -698,6 +700,13 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, } } + // keys to infer token metrics + if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)){ + infer_token_latencies = (val == 1) ? 
true : false;
+    if (infer_token_latencies){
+      lookupkv(model, scenario, "token_latency_scaling_factor", &token_latency_scaling_factor, nullptr, 1);
+    }
+  }
   // keys that apply to SingleStream
   lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
            &single_stream_target_latency_percentile, 0.01);
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h
index 5222f3156..d557e9706 100644
--- a/loadgen/test_settings_internal.h
+++ b/loadgen/test_settings_internal.h
@@ -85,6 +85,9 @@ struct TestSettingsInternal {
   bool use_token_latencies = false;
   int64_t server_ttft_latency;
   int64_t server_tpot_latency;
+
+  bool infer_token_latencies = false;
+  int64_t token_latency_scaling_factor;
 };
 
 /// \brief A namespace of collections of FindPeakPerformance helper functions,
diff --git a/mlperf.conf b/mlperf.conf
index 7e286b565..dd835563d 100644
--- a/mlperf.conf
+++ b/mlperf.conf
@@ -41,9 +41,8 @@ retinanet.MultiStream.target_latency = 528
 3d-unet.*.sample_concatenate_permutation = 1
 
 # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
-gptj.Server.sample_concatenate_permutation = 1
-gptj.SingleStream.sample_concatenate_permutation = 1
-llama2-70b.Server.sample_concatenate_permutation = 1
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
 
 *.Server.target_latency = 10
 *.Server.target_latency_percentile = 99
@@ -57,8 +56,11 @@ dlrm-v2.Server.target_latency = 60
 rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000
-# Falcon Server scenario requires two latency constraints
+# The llama2-70b benchmark measures token latencies
 llama2-70b.*.use_token_latencies = 1
+# The gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
 # Only ttft and tpot are tracked for the llama2-70b benchmark therefore target_latency = 0
 llama2-70b.Server.target_latency = 0
 llama2-70b.Server.ttft_latency = 2000
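
Note on the inferred metrics above (an illustrative sketch, not part of the patch): when infer_token_latencies is set, the loadgen never sees real token counts. It scales the completed sample count by token_latency_scaling_factor (69 for gptj in mlperf.conf, effectively an assumed tokens-per-sample value) and divides by the run duration, using sample_count - 1 in the Server scenario as noted in the commit message. The helper names and example numbers below are hypothetical:

GPTJ_SCALING_FACTOR = 69  # gptj.*.token_latency_scaling_factor from mlperf.conf


def offline_inferred_tps(sample_count, max_latency_s):
    # Offline summary: scaling_factor * sample_count / pr.max_latency
    return GPTJ_SCALING_FACTOR * sample_count / max_latency_s


def server_inferred_completed_tps(sample_count, all_samples_done_s):
    # Server summary: scaling_factor * (sample_count - 1) /
    # pr.final_query_all_samples_done_time
    return GPTJ_SCALING_FACTOR * (sample_count - 1) / all_samples_done_s


# e.g. 24576 completed samples in a 600 s Offline run -> ~2826 tokens/s
print(offline_inferred_tps(24576, 600.0))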