Bug - Fix issues caused by omegaconf version update #636

Merged: 11 commits, Aug 20, 2024
41 changes: 20 additions & 21 deletions superbench/executor/executor.py
@@ -228,29 +228,16 @@ def exec(self):
logger.warning('Monitor can not support CPU platform.')

benchmark_real_name = benchmark_name.split(':')[0]
if 'frameworks' in benchmark_config:
for framework in benchmark_config.frameworks or [Framework.NONE.value]:
if benchmark_real_name == 'model-benchmarks' or (
':' not in benchmark_name and benchmark_name.endswith('_models')
):
for model in benchmark_config.models:
full_name = f'{benchmark_name}/{framework}-{model}'
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
model,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.__get_arguments(
{} if 'parameters' not in benchmark_config else benchmark_config.parameters
)
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)
else:
full_name = benchmark_name
frameworks = benchmark_config.get('frameworks', [Framework.NONE.value])
for framework in frameworks:
if benchmark_real_name == 'model-benchmarks' or (
':' not in benchmark_name and benchmark_name.endswith('_models')
):
for model in benchmark_config.models:
full_name = f'{benchmark_name}/{framework}-{model}'
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
benchmark_real_name,
model,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.__get_arguments(
@@ -259,6 +246,18 @@ def exec(self):
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)
else:
full_name = benchmark_name
logger.info('Executor is going to execute %s.', full_name)
context = BenchmarkRegistry.create_benchmark_context(
benchmark_real_name,
platform=self.__get_platform(),
framework=Framework(framework.lower()),
parameters=self.
__get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters)
)
result = self.__exec_benchmark(full_name, context)
benchmark_results.append(result)

if monitor:
monitor.stop()
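A note on the root cause, for readers of this diff: newer omegaconf releases (2.1 and later, to the best of our knowledge) raise ConfigAttributeError when a missing key is read through attribute access, where older releases returned None. The fix therefore drops the "'frameworks' in benchmark_config" membership check followed by an attribute read in favor of a single DictConfig.get() call with an explicit default. A minimal sketch of the difference, not taken from the PR (the config contents are made up):

from omegaconf import OmegaConf
from omegaconf.errors import ConfigAttributeError

# Hypothetical benchmark entry with no 'frameworks' key.
benchmark_config = OmegaConf.create({'models': ['resnet18']})

try:
    # Old pattern: raises on newer omegaconf; returned None on some older releases.
    frameworks = benchmark_config.frameworks or ['none']
except ConfigAttributeError:
    frameworks = ['none']

# New pattern from the diff: one call, explicit default, stable across versions.
frameworks = benchmark_config.get('frameworks', ['none'])
print(frameworks)  # ['none']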
4 changes: 2 additions & 2 deletions superbench/runner/runner.py
@@ -84,7 +84,7 @@ def __validate_sb_config(self): # noqa: C901
if 'proc_num' not in mode:
self._sb_benchmarks[name].modes[idx].proc_num = 8
elif mode.name == 'mpi':
if 'machinefile' not in mode:
if 'mca' not in mode:
self._sb_benchmarks[name].modes[idx].mca = {
'pml': 'ob1',
'btl': '^openib',
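For reference, a minimal sketch (not from the PR) of the corrected defaulting step above: the guard now keys on 'mca' itself rather than 'machinefile', so MCA defaults are only filled in when the user supplied none. The diff truncates the default dict, so only the two visible entries appear here:

from omegaconf import OmegaConf

# Hypothetical mpi mode entry without a user-provided 'mca' section.
mode = OmegaConf.create({'name': 'mpi', 'proc_num': 8})

if 'mca' not in mode:  # previously keyed on 'machinefile', an unrelated field
    mode.mca = {'pml': 'ob1', 'btl': '^openib'}  # real default has more entries

print(OmegaConf.to_yaml(mode))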
@@ -448,7 +448,7 @@ def _run_proc(self, benchmark_name, mode, vars):
mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index})
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)

timeout = self._sb_benchmarks[benchmark_name].get('timeout', 60)
timeout = self._sb_benchmarks[benchmark_name].get('timeout', None)
if isinstance(timeout, int):
timeout = max(timeout, 60)

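Restating the timeout change: a benchmark with no timeout entry now runs with no deadline (None) rather than inheriting a 60-second default, and any configured integer is still clamped up to a 60-second floor. A small self-contained sketch of that policy (the helper name is ours, not the runner's):

def resolve_timeout(benchmark_config: dict):
    """Mirror the runner's policy: missing key -> None, integer -> at least 60."""
    timeout = benchmark_config.get('timeout', None)
    if isinstance(timeout, int):
        timeout = max(timeout, 60)
    return timeout

assert resolve_timeout({'timeout': 120}) == 120  # large values kept as-is
assert resolve_timeout({'timeout': 30}) == 60    # clamped to the floor
assert resolve_timeout({}) is None               # no deadline at all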
4 changes: 3 additions & 1 deletion tests/executor/test_executor.py
@@ -166,5 +166,7 @@ def test_exec_default_benchmarks(self, mock_launch_benchmark):
self.assertTrue(p.is_dir())
self.assertTrue((p / 'results.json').is_file())
with (p / 'results.json').open() as f:
for result in json.load(f):
results = json.load(f)
self.assertTrue(len(results) > 0)
for result in results:
self.assertIn(benchmark_name, result['name'])
43 changes: 43 additions & 0 deletions tests/runner/test_runner.py
@@ -41,6 +41,22 @@ def test_set_logger(self):
expected_log_file = Path(self.runner._sb_output_dir) / 'sb-run.log'
self.assertTrue(expected_log_file.is_file())

def test_validate_sb_config(self):
"""Test validate_sb_config."""
self.runner._SuperBenchRunner__validate_sb_config()
self.assertIn('env', self.runner._sb_config.superbench)
for name in self.runner._sb_benchmarks:
self.assertIn('modes', self.runner._sb_config.superbench.benchmarks[name])
for mode in self.runner._sb_config.superbench.benchmarks[name].modes:
self.assertIn('env', mode)
if mode.name == 'local':
self.assertIn('proc_num', mode)
self.assertIn('prefix', mode)
if mode.name == 'torch.distributed':
self.assertIn('proc_num', mode)
if mode.name == 'mpi':
self.assertIn('mca', mode)

def test_get_failure_count(self):
"""Test get_failure_count."""
self.assertEqual(0, self.runner.get_failure_count())
@@ -410,3 +426,30 @@ def test_generate_metric_name(self):
test_case['run_count'], test_case['curr_rank'], test_case['curr_run']
), test_case['expected']
)

def test_run_proc_timeout(self):
"""Test run_proc_ timeout."""
self.runner._sb_benchmarks = {
'benchmark1': {
'timeout': 120
},
'benchmark2': {
'timeout': None
},
'benchmark3': {
'timeout': 30
},
}

test_cases = [
('benchmark1', 120),
('benchmark2', None),
('benchmark3', 60),
]

for benchmark_name, expected_timeout in test_cases:
with self.subTest(benchmark_name=benchmark_name):
timeout = self.runner._sb_benchmarks[benchmark_name].get('timeout', None)
if isinstance(timeout, int):
timeout = max(timeout, 60)
self.assertEqual(timeout, expected_timeout)