Merge pull request #210 from mdboom/estimate-sample-size

Add warnings about too few or too many samples
psf · Dec 19, 2024 · bbc8e9f · bbc8e9f
2 parents c584266 + a02bff9
commit bbc8e9f
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 8 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -345,6 +345,19 @@ Benchmark class
 
       Raise an exception if the benchmark has no values.
 
+   .. method:: required_nprocesses()
+
+      Determines the number of separate process runs that would be required
+      to achieve stable results. Specifically, the target is to have 95%
+      certainty that there is a variation of less than 1%. If the result is
+      greater than the number of processes recorded in the input data, the
+      value is meaningless and only means "more samples are required".
+
+      The method used is described in this Wikipedia article about estimating
+      the sampling of a mean:
+
+      https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+
    .. method:: update_metadata(metadata: dict)
 
       Update metadata of all runs of the benchmark.

diff --git a/pyperf/__main__.py b/pyperf/__main__.py
@@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                                            dump=dump,
                                            checks=checks,
                                            result=result,
-                                           display_runs_args=display_runs_args)
+                                           display_runs_args=display_runs_args,
+                                           only_checks=only_checks)
 
             if bench_lines:
                 empty_line(lines)
@@ -491,10 +492,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                 empty_line(output)
                 output.extend(lines)
 
+        contains_warning = False
         for line in output:
+            if line.startswith("WARNING:"):
+                contains_warning = True
             print(line)
 
-        if not output and only_checks:
+        if not contains_warning and only_checks:
             if len(data) == 1:
                 print("The benchmark seems to be stable")
             else:

diff --git a/pyperf/_bench.py b/pyperf/_bench.py
@@ -424,6 +424,47 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
+    def required_nprocesses(self):
+        """
+        Estimate how many separate process runs are needed for stable results.
+
+        Specifically, the target is to have 95% certainty that there is a
+        variation of less than 1%. If the result is greater than the number
+        of processes recorded in the input data, the value is meaningless and
+        only means "more samples are required".
+
+        Returns None when fewer than two runs contain values, because a
+        sample standard deviation cannot be computed from a single data
+        point. Otherwise returns the estimate rounded up to an integer.
+
+        The method used is described in this Wikipedia article about estimating
+        the sampling of a mean:
+
+        https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+        """
+        # Get the means of the values per process. The values within the process
+        # often vary considerably (e.g. due to cache effects), but the variances
+        # between processes should be fairly consistent. Additionally, this
+        # value is intended to be advice for the number of processes to run.
+        values = []
+        for run in self._runs:
+            # Runs without any values contribute nothing to the estimate.
+            if len(run.values):
+                values.append(statistics.mean(run.values))
+
+        # statistics.stdev() needs at least two data points.
+        if len(values) < 2:
+            return None
+
+        total = math.fsum(values)
+        mean = total / len(values)
+        stddev = statistics.stdev(values)
+
+        # Normalize the stddev so we can target "percentage changed" rather than
+        # absolute time
+        # NOTE(review): this divides by the mean, so it assumes the mean of the
+        # per-process means is non-zero -- confirm values are always positive.
+        sigma = stddev / mean
+
+        # 95% certainty
+        Z = 1.96
+        # 1% variation
+        W = 0.01
+
+        # (4Z²σ²)/(W²)
+        # Rounded up because the result is a whole number of processes.
+        return math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))
+
     def percentile(self, p):
         if not (0 <= p <= 100):
             raise ValueError("p must be in the range [0; 100]")

diff --git a/pyperf/_cli.py b/pyperf/_cli.py
@@ -400,7 +400,7 @@ def value_bucket(value):
     return lines
 
 
-def format_checks(bench, lines=None):
+def format_checks(bench, lines=None, check_too_many_processes=False):
     if lines is None:
         lines = []
 
@@ -412,6 +412,7 @@ def format_checks(bench, lines=None):
     mean = bench.mean()
     warnings = []
     warn = warnings.append
+    required_nprocesses = None
 
     # Display a warning if the standard deviation is greater than 10%
     # of the mean
@@ -421,6 +422,14 @@ def format_checks(bench, lines=None):
         if percent >= 10.0:
             warn("the standard deviation (%s) is %.0f%% of the mean (%s)"
                  % (bench.format_value(stdev), percent, bench.format_value(mean)))
+        else:
+            # display a warning if the number of samples isn't enough to get a stable result
+            required_nprocesses = bench.required_nprocesses()
+            if (
+                required_nprocesses is not None and
+                required_nprocesses > len(bench._runs)
+            ):
+                warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)")
 
     # Minimum and maximum, detect obvious outliers
     for minimum, value in (
@@ -457,6 +466,19 @@ def format_checks(bench, lines=None):
         lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.")
         lines.append("Use --quiet option to hide these warnings.")
 
+    if check_too_many_processes:
+        if required_nprocesses is None:
+            required_nprocesses = bench.required_nprocesses()
+        if (
+            required_nprocesses is not None and
+            required_nprocesses < len(bench._runs) * 0.75
+        ):
+            lines.append("Benchmark was run more times than necessary to get a stable result.")
+            lines.append(
+                "Consider passing processes=%d to the Runner constructor to save time." %
+                required_nprocesses
+            )
+
     # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
     for run in bench._runs:
         cpu_config = run._metadata.get('cpu_config')
@@ -549,7 +571,7 @@ def format_result(bench):
 
 def format_benchmark(bench, checks=True, metadata=False,
                      dump=False, stats=False, hist=False, show_name=False,
-                     result=True, display_runs_args=None):
+                     result=True, display_runs_args=None, only_checks=False):
     lines = []
 
     if metadata:
@@ -568,7 +590,7 @@ def format_benchmark(bench, checks=True, metadata=False,
         format_stats(bench, lines=lines)
 
     if checks:
-        format_checks(bench, lines=lines)
+        format_checks(bench, lines=lines, check_too_many_processes=only_checks)
 
     if result:
         empty_line(lines)

diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py
@@ -628,8 +628,18 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
-        self.assertEqual(stdout.rstrip(),
-                         'The benchmark seems to be stable')
+        self.assertIn(
+            textwrap.dedent(
+                """
+                Benchmark was run more times than necessary to get a stable result.
+                Consider passing processes=7 to the Runner constructor to save time.
+                """
+            ).strip(), stdout.rstrip()
+        )
+        self.assertIn(
+            'The benchmark seems to be stable',
+            stdout.rstrip()
+        )
 
     def test_command(self):
         command = [sys.executable, '-c', 'pass']
@@ -689,7 +699,7 @@ def _check_track_memory(self, track_option):
                              '[1,2]*1000',
                              '-o', tmp_name)
             bench = pyperf.Benchmark.load(tmp_name)
-        
+
         self._check_track_memory_bench(bench, loops=5)
 
     def test_track_memory(self):