From 478547763795ab430875f9a77c5c69500da49591 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 8 Aug 2024 04:52:59 +0000 Subject: [PATCH 1/8] fix executor --- superbench/executor/executor.py | 42 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index c4a812a9c..224981348 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -228,29 +228,16 @@ def exec(self): logger.warning('Monitor can not support CPU platform.') benchmark_real_name = benchmark_name.split(':')[0] - if 'frameworks' in benchmark_config: - for framework in benchmark_config.frameworks or [Framework.NONE.value]: - if benchmark_real_name == 'model-benchmarks' or ( - ':' not in benchmark_name and benchmark_name.endswith('_models') - ): - for model in benchmark_config.models: - full_name = f'{benchmark_name}/{framework}-{model}' - logger.info('Executor is going to execute %s.', full_name) - context = BenchmarkRegistry.create_benchmark_context( - model, - platform=self.__get_platform(), - framework=Framework(framework.lower()), - parameters=self.__get_arguments( - {} if 'parameters' not in benchmark_config else benchmark_config.parameters - ) - ) - result = self.__exec_benchmark(full_name, context) - benchmark_results.append(result) - else: - full_name = benchmark_name + frameworks = benchmark_config.get('frameworks', [Framework.NONE.value]) + for framework in frameworks: + if benchmark_real_name == 'model-benchmarks' or ( + ':' not in benchmark_name and benchmark_name.endswith('_models') + ): + for model in benchmark_config.models: + full_name = f'{benchmark_name}/{framework}-{model}' logger.info('Executor is going to execute %s.', full_name) context = BenchmarkRegistry.create_benchmark_context( - benchmark_real_name, + model, platform=self.__get_platform(), framework=Framework(framework.lower()), parameters=self.__get_arguments( @@ -259,6 +246,19 @@ def 
exec(self): ) result = self.__exec_benchmark(full_name, context) benchmark_results.append(result) + else: + full_name = benchmark_name + logger.info('Executor is going to execute %s.', full_name) + context = BenchmarkRegistry.create_benchmark_context( + benchmark_real_name, + platform=self.__get_platform(), + framework=Framework(framework.lower()), + parameters=self.__get_arguments( + {} if 'parameters' not in benchmark_config else benchmark_config.parameters + ) + ) + result = self.__exec_benchmark(full_name, context) + benchmark_results.append(result) if monitor: monitor.stop() From dcaed1dac01af4ffce80fd9b332df113dd1b5617 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 8 Aug 2024 06:43:33 +0000 Subject: [PATCH 2/8] add test case --- tests/executor/test_executor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/executor/test_executor.py b/tests/executor/test_executor.py index 984f0437e..2c287eb37 100644 --- a/tests/executor/test_executor.py +++ b/tests/executor/test_executor.py @@ -166,5 +166,7 @@ def test_exec_default_benchmarks(self, mock_launch_benchmark): self.assertTrue(p.is_dir()) self.assertTrue((p / 'results.json').is_file()) with (p / 'results.json').open() as f: - for result in json.load(f): + results = json.load(f) + self.assertTrue(len(results) > 0) + for result in results: self.assertIn(benchmark_name, result['name']) From ca1b52d20e14dc21828aea761b0aaf28860f8a0f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 8 Aug 2024 08:30:08 +0000 Subject: [PATCH 3/8] fix --- superbench/runner/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index cd0c8c4dc..84d18035b 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -84,7 +84,7 @@ def __validate_sb_config(self): # noqa: C901 if 'proc_num' not in mode: self._sb_benchmarks[name].modes[idx].proc_num = 8 elif mode.name == 'mpi': - if 'machinefile' not in mode: + if 
'mca' not in mode: self._sb_benchmarks[name].modes[idx].mca = { 'pml': 'ob1', 'btl': '^openib', From 7d9ca835aa8f238503550f6790850b18544a6ca3 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 8 Aug 2024 08:50:09 +0000 Subject: [PATCH 4/8] fix format --- superbench/executor/executor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index 224981348..a4c007e31 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -253,9 +253,8 @@ def exec(self): benchmark_real_name, platform=self.__get_platform(), framework=Framework(framework.lower()), - parameters=self.__get_arguments( - {} if 'parameters' not in benchmark_config else benchmark_config.parameters - ) + parameters=self. + __get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters) ) result = self.__exec_benchmark(full_name, context) benchmark_results.append(result) From 667095ffc6c1464c576f41ed3aeb6b8a7205c4a6 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 13 Aug 2024 05:57:51 +0000 Subject: [PATCH 5/8] fix --- superbench/runner/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 84d18035b..6eda8cb68 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -448,7 +448,7 @@ def _run_proc(self, benchmark_name, mode, vars): mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index}) logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank) - timeout = self._sb_benchmarks[benchmark_name].get('timeout', 60) + timeout = self._sb_benchmarks[benchmark_name].get('timeout', None) if isinstance(timeout, int): timeout = max(timeout, 60) From cfccc1dfe3377ed786a3e3fd7eadc697c53a1139 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 13 Aug 2024 10:18:53 +0000 
Subject: [PATCH 6/8] add test cases --- tests/runner/test_runner.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py index 250942267..82ee21dda 100644 --- a/tests/runner/test_runner.py +++ b/tests/runner/test_runner.py @@ -41,6 +41,22 @@ def test_set_logger(self): expected_log_file = Path(self.runner._sb_output_dir) / 'sb-run.log' self.assertTrue(expected_log_file.is_file()) + def test_validate_sb_config(self): + """Test validate_sb_config.""" + self.runner._SuperBenchRunner__validate_sb_config() + self.assertIn('env', self.runner._sb_config.superbench) + for name in self.runner._sb_benchmarks: + self.assertIn('modes', self.runner._sb_config.superbench.benchmarks[name]) + for mode in self.runner._sb_config.superbench.benchmarks[name].modes: + self.assertIn('env', mode) + if mode.name == 'local': + self.assertIn('proc_num', mode) + self.assertIn('prefix', mode) + if mode.name == 'torch.distributed': + self.assertIn('proc_num', mode) + if mode.name == 'mpi': + self.assertIn('mca', mode) + def test_get_failure_count(self): """Test get_failure_count.""" self.assertEqual(0, self.runner.get_failure_count()) @@ -410,3 +426,24 @@ def test_generate_metric_name(self): test_case['run_count'], test_case['curr_rank'], test_case['curr_run'] ), test_case['expected'] ) + + def test_run_proc_timeout(self): + "Test run_proc_timeout." 
+ self.runner._sb_benchmarks = { + 'benchmark1': {'timeout': 120}, + 'benchmark2': {'timeout': None}, + 'benchmark3': {'timeout': 30}, + } + + test_cases = [ + ('benchmark1', 120), + ('benchmark2', None), + ('benchmark3', 60), + ] + + for benchmark_name, expected_timeout in test_cases: + with self.subTest(benchmark_name=benchmark_name): + timeout = self.runner._sb_benchmarks[benchmark_name].get('timeout', None) + if isinstance(timeout, int): + timeout = max(timeout, 60) + self.assertEqual(timeout, expected_timeout) From 5927e0fc81ac9926bf5acaf0ad9c9e625f4b5b6d Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 14 Aug 2024 03:17:05 +0000 Subject: [PATCH 7/8] pass lint --- tests/runner/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py index 82ee21dda..2c34c7324 100644 --- a/tests/runner/test_runner.py +++ b/tests/runner/test_runner.py @@ -428,7 +428,7 @@ def test_generate_metric_name(self): ) def test_run_proc_timeout(self): - "Test run_proc_timeout." 
+ """Test run_proc_timeout.""" self.runner._sb_benchmarks = { 'benchmark1': {'timeout': 120}, 'benchmark2': {'timeout': None}, 'benchmark3': {'timeout': 30}, } From 48a89309c5c0a5bfb966a2b5999b8db93a7b07f8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 14 Aug 2024 03:22:25 +0000 Subject: [PATCH 8/8] fix lint --- tests/runner/test_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py index 2c34c7324..fd45ae0a8 100644 --- a/tests/runner/test_runner.py +++ b/tests/runner/test_runner.py @@ -428,11 +428,17 @@ def test_generate_metric_name(self): ) def test_run_proc_timeout(self): - """Test run_proc_timeout.""" + """Test run_proc_timeout.""" self.runner._sb_benchmarks = { - 'benchmark1': {'timeout': 120}, - 'benchmark2': {'timeout': None}, - 'benchmark3': {'timeout': 30}, + 'benchmark1': { + 'timeout': 120 + }, + 'benchmark2': { + 'timeout': None + }, + 'benchmark3': { + 'timeout': 30 + }, } test_cases = [