Gptj metric scaling (#1699)
* Enable inferring tokens per second for gptj

* Add inferred token latencies demos

* Use sample_count - 1 to infer completed tps

* Update mlperf.conf
pgmpablo157321 authored May 21, 2024
1 parent 8fcd0d0 commit fb65dde
Showing 8 changed files with 307 additions and 7 deletions.
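A note on the new metrics before the per-file diffs: the inferred values are plain arithmetic over counters loadgen already tracks. As results.cc below shows, Offline reports token_latency_scaling_factor * sample_count / max_latency, while Server reports token_latency_scaling_factor * (sample_count - 1) / final_query_all_samples_done_time. A minimal sketch of that math, with made-up inputs:

# Sketch of the inferred tokens-per-second computation added in results.cc.
# The numeric inputs below are hypothetical examples, not measured results.

def inferred_offline_tps(scaling_factor, sample_count, max_latency_s):
    # Offline: each sample is assumed to produce ~scaling_factor tokens.
    return scaling_factor * sample_count / max_latency_s

def inferred_server_completed_tps(scaling_factor, sample_count, done_time_s):
    # Server: sample_count - 1 is used for the "completed" rate.
    return scaling_factor * (sample_count - 1) / done_time_s

print(inferred_offline_tps(69, 1000, 100.0))           # 690.0 tokens/s
print(inferred_server_completed_tps(69, 1000, 100.0))  # 689.31 tokens/s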
2 changes: 2 additions & 0 deletions loadgen/bindings/python_api.cc
@@ -338,6 +338,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
.def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
.def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
.def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
.def_readwrite("infer_token_latencies", &TestSettings::infer_token_latencies)
.def_readwrite("token_latency_scaling_factor", &TestSettings::token_latency_scaling_factor)
.def("FromConfig", &TestSettings::FromConfig, "FromConfig.");

pybind11::enum_<LoggingMode>(m, "LoggingMode")
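The demos added below drive these new bindings end to end; for reference, the minimal harness-side usage is just two extra assignments (a sketch; the values are the illustrative ones used in the demos):

import mlperf_loadgen

settings = mlperf_loadgen.TestSettings()
settings.scenario = mlperf_loadgen.TestScenario.Offline
settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
# New fields exposed by this change: have loadgen infer token throughput by
# scaling the sample rate with an assumed tokens-per-sample factor.
settings.infer_token_latencies = 1
settings.token_latency_scaling_factor = 21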
119 changes: 119 additions & 0 deletions loadgen/demos/token_metrics/py_demo_offline_inferred.py
@@ -0,0 +1,119 @@
# Copyright 2019 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

"""Python demo showing how to use the MLPerf Inference load generator bindings.
"""

from __future__ import print_function

import argparse
import threading
import time
import numpy as np
import array

import mlperf_loadgen


def f(x, y):
return (4 + 3*x*y + x**3 + y**2)

def create_responses(n, m, mod = 4):
r = []
for i in range(n):
r.append([f(i,j) for j in range(m + (i%mod))])
return r
responses = create_responses(1024, 20, mod = 3)

def load_samples_to_ram(query_samples):
del query_samples
return


def unload_samples_from_ram(query_samples):
del query_samples
return


# Processes queries in 3 slices that complete at different times.
def process_query_async(query_samples, i_slice):
time.sleep(3 * (i_slice + 1))
query_responses = []
samples_to_complete = query_samples[i_slice:len(query_samples):3]
for s in samples_to_complete:
response_array = np.array(responses[s.index], np.int32)
token = response_array[0]
time.sleep(.0002)
response_token = array.array("B", token.tobytes())
response_token_info = response_token.buffer_info()
response_token_data = response_token_info[0]
response_token_size = response_token_info[1] * response_token.itemsize
# mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
time.sleep(.02)
n_tokens = len(response_array)
response_array = array.array("B", response_array.tobytes())
response_info = response_array.buffer_info()
response_data = response_info[0]
response_size = response_info[1] * response_array.itemsize
query_responses.append(
mlperf_loadgen.QuerySampleResponse(
s.id, response_data, response_size))
mlperf_loadgen.QuerySamplesComplete(query_responses)


def issue_query(query_samples):
threading.Thread(target=process_query_async,
args=(query_samples, 0)).start()
threading.Thread(target=process_query_async,
args=(query_samples, 1)).start()
threading.Thread(target=process_query_async,
args=(query_samples, 2)).start()


def flush_queries():
pass


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance")
parser.add_argument("--expected-qps", type=int, default=1000)
parser.add_argument("--min-duration-ms", type=int, default=30000)
return parser.parse_args()


def main():
args = get_args()
settings = mlperf_loadgen.TestSettings()
settings.scenario = mlperf_loadgen.TestScenario.Offline
if args.mode == "performance":
settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
else:
settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
settings.offline_expected_qps = args.expected_qps
settings.min_duration_ms = args.min_duration_ms
settings.infer_token_latencies = 1
settings.token_latency_scaling_factor = 21

sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
qsl = mlperf_loadgen.ConstructQSL(
1024, 128, load_samples_to_ram, unload_samples_from_ram)
mlperf_loadgen.StartTest(sut, qsl, settings)
mlperf_loadgen.DestroyQSL(qsl)
mlperf_loadgen.DestroySUT(sut)


if __name__ == "__main__":
main()
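A note on the scaling factor chosen here: create_responses(1024, 20, mod=3) produces responses of length 20 + (i % 3), i.e. 20, 21, or 22 tokens, so the mean response length is roughly 21, which appears to be why token_latency_scaling_factor is set to 21 above. A quick check:

# Check that token_latency_scaling_factor = 21 matches the mean response
# length generated by create_responses(1024, 20, mod=3).
lengths = [20 + (i % 3) for i in range(1024)]
print(sum(lengths) / len(lengths))  # ~21.0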
116 changes: 116 additions & 0 deletions loadgen/demos/token_metrics/py_demo_server_inferred.py
@@ -0,0 +1,116 @@
# Copyright 2019 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

"""Python demo showing how to use the MLPerf Inference load generator bindings.
"""

from __future__ import print_function

import argparse
import array
import threading
import time
import numpy as np

from absl import app
import mlperf_loadgen

def f(x, y):
return (4 + 3*x*y + x**3 + y**2)

def create_responses(n, m, mod = 4):
r = []
for i in range(n):
r.append([f(i,j) for j in range(m + (i%mod))])
return r
responses = create_responses(1024, 20, mod = 3)

def load_samples_to_ram(query_samples):
del query_samples
return


def unload_samples_from_ram(query_samples):
del query_samples
return


def process_query_async(query_samples):
"""Processes the list of queries."""
query_responses = []
for s in query_samples:
response_array = np.array(responses[s.index], np.int32)
token = response_array[0]
time.sleep(.0002)
response_token = array.array("B", token.tobytes())
response_token_info = response_token.buffer_info()
response_token_data = response_token_info[0]
response_token_size = response_token_info[1] * response_token.itemsize
time.sleep(.02)
n_tokens = len(response_array)
response_array = array.array("B", response_array.tobytes())
response_info = response_array.buffer_info()
response_data = response_info[0]
response_size = response_info[1] * response_array.itemsize
# print(f"Reported size python: {n_tokens}")
query_responses.append(
mlperf_loadgen.QuerySampleResponse(
s.id, response_data, response_size))
mlperf_loadgen.QuerySamplesComplete(query_responses)


def issue_query(query_samples):
threading.Thread(target=process_query_async,
args=[query_samples]).start()


def flush_queries():
pass

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance")
parser.add_argument("--target-qps", type=int, default=100)
parser.add_argument("--target-latency-ns", type=int, default=100000000)
parser.add_argument("--min-query-count", type=int, default=100)
parser.add_argument("--min-duration-ms", type=int, default=30000)
return parser.parse_args()

def main():
args = get_args()
settings = mlperf_loadgen.TestSettings()
settings.scenario = mlperf_loadgen.TestScenario.Server
if args.mode == "performance":
settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
else:
settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
settings.server_target_qps = args.target_qps
settings.server_target_latency_ns = args.target_latency_ns
settings.min_query_count = args.min_query_count
settings.min_duration_ms = args.min_duration_ms
settings.infer_token_latencies = 1
settings.token_latency_scaling_factor = 21

sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
qsl = mlperf_loadgen.ConstructQSL(
1024, 128, load_samples_to_ram, unload_samples_from_ram)
mlperf_loadgen.StartTest(sut, qsl, settings)
mlperf_loadgen.DestroyQSL(qsl)
mlperf_loadgen.DestroySUT(sut)


if __name__ == "__main__":
main()

48 changes: 47 additions & 1 deletion loadgen/results.cc
@@ -440,6 +440,32 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) {
break;
}
case TestScenario::Server:
double tps_as_completed =
token_count / pr.final_query_all_samples_done_time;
summary("Completed tokens per second: ",
DoubleToString(tps_as_completed));
break;
}
}

if (settings.infer_token_latencies){
switch (settings.scenario) {
case TestScenario::SingleStream: {
break;
}
case TestScenario::MultiStream: {
break;
}
case TestScenario::Offline: {
double tokens_per_second = settings.token_latency_scaling_factor * sample_count / pr.max_latency;
summary("Tokens per second (inferred): ", tokens_per_second);
break;
}
case TestScenario::Server:
double tps_as_completed =
settings.token_latency_scaling_factor * (sample_count - 1) / pr.final_query_all_samples_done_time;
summary("Completed tokens per second (inferred): ",
DoubleToString(tps_as_completed));
break;
}
}
@@ -777,8 +803,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
}
}

if (settings.infer_token_latencies){
switch (settings.scenario) {
case TestScenario::Server: {
double completed_tokens_per_second = (sample_count - 1) * settings.token_latency_scaling_factor / pr.final_query_all_samples_done_time;
MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second", completed_tokens_per_second);
break;
}
case TestScenario::Offline: {
double tokens_per_second = sample_count * settings.token_latency_scaling_factor / pr.max_latency;
MLPERF_LOG(detail, "result_inferred_tokens_per_second", tokens_per_second);
break;
}
case TestScenario::SingleStream: {
break;
}
case TestScenario::MultiStream: {
break;
}
}
#endif
}

}
} // namespace loadgen
} // namespace mlperf
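Besides the summary lines, LogDetail above emits the new result_inferred_tokens_per_second and result_inferred_completed_tokens_per_second keys. A hedged post-processing sketch for pulling them out of mlperf_log_detail.txt, assuming the usual ":::MLLOG {json}" line format:

import json

# Hypothetical post-processing sketch: extract the new inferred-throughput
# keys from a loadgen detail log.
wanted = {"result_inferred_tokens_per_second",
          "result_inferred_completed_tokens_per_second"}
with open("mlperf_log_detail.txt") as f:
    for line in f:
        if not line.startswith(":::MLLOG"):
            continue
        entry = json.loads(line.split(":::MLLOG", 1)[1])
        if entry.get("key") in wanted:
            print(entry["key"], entry["value"])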
3 changes: 3 additions & 0 deletions loadgen/test_settings.h
@@ -267,6 +267,9 @@ struct TestSettings {
/// Token latency parameters
uint64_t server_ttft_latency = 100000000;
uint64_t server_tpot_latency = 100000000;
/// \brief Infer token latencies
bool infer_token_latencies = false;
uint64_t token_latency_scaling_factor;
/**@}*/
};

13 changes: 11 additions & 2 deletions loadgen/test_settings_internal.cc
@@ -51,7 +51,9 @@ TestSettingsInternal::TestSettingsInternal(
sample_concatenate_permutation(false),
use_token_latencies(requested.use_token_latencies),
server_ttft_latency(requested.server_ttft_latency),
server_tpot_latency(requested.server_tpot_latency){
server_tpot_latency(requested.server_tpot_latency),
infer_token_latencies(requested.infer_token_latencies),
token_latency_scaling_factor(requested.token_latency_scaling_factor){
// Target QPS, target latency, and max_async_queries.
switch (requested.scenario) {
case TestScenario::SingleStream:
Expand Down Expand Up @@ -689,7 +691,7 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
nullptr);
lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr);

// keys that apply to token metrics
// keys to measure token metrics
if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)){
use_token_latencies = (val == 1) ? true : false;
if (use_token_latencies){
@@ -698,6 +700,13 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
}
}

// keys to infer token metrics
if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)){
infer_token_latencies = (val == 1) ? true : false;
if (infer_token_latencies){
lookupkv(model, scenario, "token_latency_scaling_factor", &token_latency_scaling_factor, nullptr, 1);
}
}
// keys that apply to SingleStream
lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
&single_stream_target_latency_percentile, 0.01);
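With the parsing above in place, a Python harness does not need to set the new fields by hand; loading the benchmark's config section should be enough. A minimal sketch, assuming the mlperf.conf shipped with this change sits next to the script:

import mlperf_loadgen

settings = mlperf_loadgen.TestSettings()
# Picks up gptj.*.infer_token_latencies and gptj.*.token_latency_scaling_factor
# via the lookupkv calls added above.
settings.FromConfig("mlperf.conf", "gptj", "Offline")
settings.scenario = mlperf_loadgen.TestScenario.Offline
settings.mode = mlperf_loadgen.TestMode.PerformanceOnly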
3 changes: 3 additions & 0 deletions loadgen/test_settings_internal.h
@@ -85,6 +85,9 @@ struct TestSettingsInternal {
bool use_token_latencies = false;
int64_t server_ttft_latency;
int64_t server_tpot_latency;

bool infer_token_latencies = false;
int64_t token_latency_scaling_factor;
};

/// \brief A namespace of collections of FindPeakPerformance helper functions,
10 changes: 6 additions & 4 deletions mlperf.conf
@@ -41,9 +41,8 @@ retinanet.MultiStream.target_latency = 528
3d-unet.*.sample_concatenate_permutation = 1

# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenarios
gptj.Server.sample_concatenate_permutation = 1
gptj.SingleStream.sample_concatenate_permutation = 1
llama2-70b.Server.sample_concatenate_permutation = 1
gptj.*.sample_concatenate_permutation = 1
llama2-70b.*.sample_concatenate_permutation = 1

*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
@@ -57,8 +56,11 @@ dlrm-v2.Server.target_latency = 60
rnnt.Server.target_latency = 1000
gptj.Server.target_latency = 20000
stable-diffusion-xl.Server.target_latency = 20000
# Falcon Server scenario requires two latency constraints
# Llama2-70b benchmark measures token latencies
llama2-70b.*.use_token_latencies = 1
# gptj benchmark infers token latencies
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69
# Only ttft and tpot are tracked for the llama2-70b benchmark, therefore target_latency = 0
llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
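In results.cc the scaling factor stands in for the measured token count, i.e. token_count is approximated by token_latency_scaling_factor * sample_count, so the 69 configured here presumably encodes the expected mean output length per gptj sample. A small illustration, using a hypothetical sample count:

# Illustration only: the sample count below is hypothetical.
token_latency_scaling_factor = 69  # from the gptj entries above
sample_count = 13368
assumed_token_count = token_latency_scaling_factor * sample_count
print(assumed_token_count)  # 922392 tokens assumed generated over the run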
