diff --git a/test/performance/parallel_scenarios.cpp b/test/performance/parallel_scenarios.cpp index 09ce439c828f..d354c252a50b 100644 --- a/test/performance/parallel_scenarios.cpp +++ b/test/performance/parallel_scenarios.cpp @@ -31,31 +31,36 @@ int main(int argc, char **argv) { int native_threads = Halide::Internal::JITSharedRuntime::get_num_threads(); + std::map, std::vector> results; + auto bench = [&](bool m, bool c, int i, int o) { - const int num_samples = 128; const int memory_limit = m ? max_memory : 128; + auto now = std::chrono::high_resolution_clock::now; + auto to_ns = [](auto delta) { return 1e9 * std::chrono::duration(delta).count(); }; + auto bench_one = [&]() { - auto t1 = std::chrono::high_resolution_clock::now(); + auto t1 = now(); callable(i, o, memory_limit, in, out); - auto t2 = std::chrono::high_resolution_clock::now(); - return 1e9 * std::chrono::duration(t2 - t1).count() / (i * o); + auto t2 = now(); + return to_ns(t2 - t1) / (i * o); }; - std::vector times(num_samples); + const int num_tasks = 8; + const int min_samples = 32; + + std::vector times[num_tasks]; if (c) { Halide::Tools::ThreadPool thread_pool; - const int num_tasks = 8; - const int samples_per_task = num_samples / num_tasks; Halide::Internal::JITSharedRuntime::set_num_threads(num_tasks * native_threads); std::vector> futures(num_tasks); for (size_t t = 0; t < futures.size(); t++) { futures[t] = thread_pool.async( [&](size_t t) { bench_one(); - for (int s = 0; s < samples_per_task; s++) { - size_t idx = t * samples_per_task + s; - times[idx] = bench_one(); + auto t_start = now(); + while (to_ns(now() - t_start) < 1e7 || times[t].size() < min_samples / num_tasks) { + times[t].push_back(bench_one()); } }, t); @@ -66,32 +71,43 @@ int main(int argc, char **argv) { } else { Halide::Internal::JITSharedRuntime::set_num_threads(native_threads); bench_one(); - for (int s = 0; s < num_samples; s++) { - times[s] = bench_one(); + auto t_start = now(); + while (to_ns(now() - t_start) < 1e7 
|| times[0].size() < min_samples) { + times[0].push_back(bench_one()); } } - std::sort(times.begin(), times.end()); - printf("%d %d %d %d ", m, c, i, o); - const int n = 8; - int off = (num_samples / n) / 2; - for (int i = 0; i < n; i++) { - printf("%g ", times[off + (num_samples * i) / n]); + + std::vector &r = results[{m, c, i, o}]; + for (int i = 0; i < num_tasks; i++) { + r.insert(r.end(), times[i].begin(), times[i].end()); } - printf("\n"); }; // The output is designed to be copy-pasted into a spreadsheet, not read by a human - printf("memory_bound contended inner outer t0 t1 t2 t3 t4 t5 t7\n"); - for (bool contended : {false, true}) { - for (bool memory_bound : {false, true}) { - for (int i : {1 << 0, 1 << 6, 1 << 12, 1 << 18}) { - for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) { - bench(memory_bound, contended, i, o); + printf("memory_bound contended inner outer num_samples 10%% 20%% 30%% 40%% 50%% 60%% 70%% 80%% 90%%\n"); + for (int repeat = 0; repeat < 10; repeat++) { + for (bool contended : {false, true}) { + for (bool memory_bound : {false, true}) { + for (int i : {1 << 6, 1 << 9, 1 << 12, 1 << 15}) { + for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) { + bench(memory_bound, contended, i, o); + } } } } } + for (auto p : results) { + auto &times = p.second; + std::sort(times.begin(), times.end()); + auto [m, c, i, o] = p.first; + printf("%d %d %d %d %d ", m, c, i, o, (int)times.size()); + for (int decile = 10; decile <= 90; decile += 10) { + printf("%g ", times[(decile * times.size()) / 100]); + } + printf("\n"); + } + printf("Success!\n"); return 0;