Merge branch 'pytorch:main' into main

pytorch · Dec 9, 2024 · 7ef5572
2 parents dfce935 + 692bf64
commit 7ef5572
Show file tree

Hide file tree

Showing 129 changed files with 319 additions and 365 deletions.
diff --git a/.github/scripts/abtest.py b/.github/scripts/abtest.py
@@ -199,9 +199,7 @@ def validate_results(a, b) -> bool:
             args.pytorch_repo
         ).is_dir(), f"Specified PyTorch repo dir {args.pytorch_repo} doesn't exist."
         commits = gitutils.get_git_commits(args.pytorch_repo, args.base, args.head)
-        assert (
-            commits
-        ), f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
+        assert commits, f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
     # setup cuda environment
     cuda_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
     result_a = run_commit(

diff --git a/.github/scripts/bmutils/analyze-bisection-result.py b/.github/scripts/bmutils/analyze-bisection-result.py
@@ -17,21 +17,15 @@ def check_env(bisection_root: str):
     assert (
         bisection_path.is_dir()
     ), f"Specified bisection root {bisection_path} is not a directory."
-    assert bisection_path.joinpath(
-        "gh-issue.md"
-    ).exists(), (
-        f"Bisection directory {bisection_path} doesn't contain file gh-issue.md."
-    )
-    assert bisection_path.joinpath(
-        "result.json"
-    ).exists(), (
-        f"Bisection directory {bisection_path} doesn't contain file result.json."
-    )
-    assert bisection_path.joinpath(
-        "config.yaml"
-    ).exists(), (
-        f"Bisection directory {bisection_path} doesn't contain file config.yaml."
-    )
+    assert (
+        bisection_path.joinpath("gh-issue.md").exists()
+    ), f"Bisection directory {bisection_path} doesn't contain file gh-issue.md."
+    assert (
+        bisection_path.joinpath("result.json").exists()
+    ), f"Bisection directory {bisection_path} doesn't contain file result.json."
+    assert (
+        bisection_path.joinpath("config.yaml").exists()
+    ), f"Bisection directory {bisection_path} doesn't contain file config.yaml."
 
 
 def setup_gh_issue(bisection_root: str, gh_workflow_id: str):

diff --git a/.github/scripts/run-config.py b/.github/scripts/run-config.py
@@ -59,9 +59,7 @@ def get_models(config) -> Optional[str]:
         r = re.compile(model_pattern)
         matched_models = list(filter(lambda x: r.match(x), models))
         enabled_models.extend(matched_models)
-    assert (
-        enabled_models
-    ), f"The model patterns you specified {config['models']} does not match any model. Please double check."
+    assert enabled_models, f"The model patterns you specified {config['models']} does not match any model. Please double check."
     return enabled_models
 
 

diff --git a/.github/scripts/userbenchmark/aicluster.py b/.github/scripts/userbenchmark/aicluster.py
@@ -236,9 +236,7 @@ def run_aicluster_benchmark(
     index = get_metrics_index(s3, benchmark_name, work_dir)
     # if the previous run is not successful, exit immediately
     if check_success and not determine_success_today(index):
-        assert (
-            False
-        ), f"Don't find the last successful run in index: { index }. Please report a bug."
+        assert False, f"Don't find the last successful run in index: { index }. Please report a bug."
     # upload to scribe by the index
     if upload_scribe:
         upload_metrics_to_scribe(s3, benchmark_name, index, work_dir)

diff --git a/bisection.py b/bisection.py
@@ -632,8 +632,8 @@ def main() -> None:
     if args.skip_update:
         skip_update_repos = list(map(lambda x: x.strip(), args.skip_update.split(",")))
         for repo in skip_update_repos:
-            assert repo in list(
-                TORCHBENCH_BISECTION_TARGETS.keys()
+            assert (
+                repo in list(TORCHBENCH_BISECTION_TARGETS.keys())
             ), f"User specified skip update repo {repo} not in list: {TORCHBENCH_BISECTION_TARGETS.keys()}"
     else:
         skip_update_repos = None

diff --git a/install.py b/install.py
@@ -103,6 +103,8 @@
             cmd.extend(["--skip"] + args.skip)
         if args.canary:
             cmd.extend(["--canary"])
+        if args.continue_on_fail:
+            cmd.extend(["--continue_on_fail"])
         cmd.extend(extra_args)
         if userbenchmark_dir.joinpath("install.py").is_file():
             # add the current run env to PYTHONPATH to load framework install utils

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,4 +1,5 @@
 [build-system]
+requires = ["setuptools", "wheel"]
 # Use legacy backend to import local packages in setup.py
 build-backend = "setuptools.build_meta:__legacy__"
 

diff --git a/regression_detector.py b/regression_detector.py
@@ -257,9 +257,7 @@ def get_metrics_by_date(
         if metric_datetime.date() == pick_date.date():
             pick_metrics_json_key = metrics_json_key
             break
-    assert (
-        pick_metrics_json_key
-    ), f"Selected date {pick_date} is not found in the latest_metrics_jsons: {latest_metrics_jsons}"
+    assert pick_metrics_json_key, f"Selected date {pick_date} is not found in the latest_metrics_jsons: {latest_metrics_jsons}"
     s3 = S3Client(USERBENCHMARK_S3_BUCKET, USERBENCHMARK_S3_OBJECT)
     metrics_json = s3.get_file_as_json(pick_metrics_json_key)
     return (metrics_json, pick_metrics_json_key)

diff --git a/requirements.txt b/requirements.txt
@@ -19,7 +19,7 @@ pyyaml
 numpy
 opencv-python
 submitit
-pynvml
+pynvml>=12.0.0
 pandas
 scipy
 numba
diff --git a/run.py b/run.py
@@ -44,12 +44,10 @@
 
 
 def run_one_step_with_cudastreams(func, streamcount):
-
     print("Running Utilization Scaling Using Cuda Streams")
 
     streamlist = []
     for i in range(1, streamcount + 1, 1):
-
         # create additional streams and prime with load
         while len(streamlist) < i:
             s = torch.cuda.Stream()

diff --git a/scripts/proper_bs.py b/scripts/proper_bs.py
@@ -121,9 +121,7 @@ def _run_model_test_proper_bs(
         except NotImplementedError as e:
             status = "NotImplemented"
             error_message = str(e)
-        except (
-            TypeError
-        ) as e:  # TypeError is raised when the model doesn't support variable batch sizes
+        except TypeError as e:  # TypeError is raised when the model doesn't support variable batch sizes
             status = "TypeError"
             error_message = str(e)
         except KeyboardInterrupt as e:

diff --git a/scripts/upload_scribe.py b/scripts/upload_scribe.py
@@ -31,7 +31,6 @@ def format_message(self, field_dict):
             elif field in self.schema["float"]:
                 message["float"][field] = float(value)
             else:
-
                 raise ValueError(
                     "Field {} is not currently used, "
                     "be intentional about adding new fields".format(field)

diff --git a/scripts/upload_scribe_v2.py b/scripts/upload_scribe_v2.py
@@ -64,7 +64,6 @@ def format_message(self, field_dict):
             elif field in self.schema["float"]:
                 message["float"][field] = float(value)
             else:
-
                 raise ValueError(
                     "Field {} is not currently used, "
                     "be intentional about adding new fields".format(field)

diff --git a/test.py b/test.py
@@ -55,7 +55,6 @@ def _create_example_model_instance(task: ModelTask, device: str):
 
 
 def _load_test(path, device):
-
     model_name = os.path.basename(path)
 
     def _skip_cuda_memory_check_p(metadata):

diff --git a/test_bench.py b/test_bench.py
@@ -60,7 +60,6 @@ def pytest_generate_tests(metafunc):
     group="hub",
 )
 class TestBenchNetwork:
-
     def test_train(self, model_path, device, benchmark):
         try:
             model_name = os.path.basename(model_path)

diff --git a/torchbenchmark/__init__.py b/torchbenchmark/__init__.py
@@ -304,7 +304,6 @@ def args(self) -> List[str]:
 
 
 class ModelTask(base_task.TaskBase):
-
     # The worker may (and often does) consume significant system resources.
     # In order to ensure that runs do not interfere with each other, we only
     # allow a single ModelTask to exist at a time.

diff --git a/torchbenchmark/_components/_impl/workers/subprocess_rpc.py b/torchbenchmark/_components/_impl/workers/subprocess_rpc.py
@@ -274,8 +274,10 @@ def write(self, msg: bytes) -> None:
     def get_writer_pid(self) -> int:
         assert (
             self._writer_pid is not None
-        ), "Writer pid is not specified. Maybe calling from child process or input pipe.\
+        ), (
+            "Writer pid is not specified. Maybe calling from child process or input pipe.\
                                               Please report a bug."
+        )
         return self._writer_pid
 
     def set_writer_pid(self, writer_pid: int) -> None:

diff --git a/torchbenchmark/_components/model_analyzer/TorchBenchAnalyzer.py b/torchbenchmark/_components/model_analyzer/TorchBenchAnalyzer.py
@@ -258,9 +258,9 @@ def export_all_records_to_csv(self):
                 ]
                 cluster_records.sort(key=lambda x: x.timestamp())
                 for record in cluster_records:
-                    csv_records[gpu_uuid][record_type][
-                        record.timestamp()
-                    ] = record.value()
+                    csv_records[gpu_uuid][record_type][record.timestamp()] = (
+                        record.value()
+                    )
         with open(self.export_csv_name, "w") as fout:
             for gpu_uuid in csv_records:
                 # timestamp record in DCGM is microsecond

diff --git a/torchbenchmark/_components/model_analyzer/dcgm/dcgm_field_helpers.py b/torchbenchmark/_components/model_analyzer/dcgm/dcgm_field_helpers.py
@@ -147,7 +147,6 @@ def default(self, obj):  # pylint: disable=E0202
 
 
 def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, userData):
-
     userData = ctypes.cast(userData, ctypes.py_object).value
     userData._ProcessValues(gpuId, values[0:numValues])
     return 0
@@ -363,7 +362,6 @@ def GetAllSinceLastCall(self):
 def py_helper_dcgm_field_values_since_entity_callback(
     entityGroupId, entityId, values, numValues, userData
 ):
-
     userData = ctypes.cast(userData, ctypes.py_object).value
     userData._ProcessValues(entityGroupId, entityId, values[0:numValues])
     return 0
@@ -382,9 +380,7 @@ def py_helper_dcgm_field_values_since_entity_callback(
 
 class DcgmFieldValueEntityCollection:
     def __init__(self, handle, groupId):
-        self.values = (
-            {}
-        )  # 3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
+        self.values = {}  # 3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
         self._handle = handle
         self._groupId = groupId
         self._numValuesSeen = 0
@@ -408,9 +404,9 @@ def _ProcessValues(self, entityGroupId, entityId, values):
             value = DcgmFieldValue(rawValue)
 
             if value.fieldId not in self.values[entityGroupId][entityId]:
-                self.values[entityGroupId][entityId][
-                    value.fieldId
-                ] = DcgmFieldValueTimeSeries()
+                self.values[entityGroupId][entityId][value.fieldId] = (
+                    DcgmFieldValueTimeSeries()
+                )
 
             self.values[entityGroupId][entityId][value.fieldId].InsertValue(value)
 

diff --git a/torchbenchmark/_components/model_analyzer/dcgm/dcgm_monitor.py b/torchbenchmark/_components/model_analyzer/dcgm/dcgm_monitor.py
@@ -123,7 +123,6 @@ def _collect_records(self):
                 for metric_type in self._metrics:
                     dcgm_field = self.model_analyzer_to_dcgm_field[metric_type]
                     for measurement in metrics[dcgm_field].values:
-
                         if measurement.value is not None:
                             # DCGM timestamp is in nanoseconds
                             records.append(

diff --git a/torchbenchmark/_components/model_analyzer/dcgm/dcgm_structs.py b/torchbenchmark/_components/model_analyzer/dcgm/dcgm_structs.py
@@ -90,9 +90,7 @@
     -21
 )  # Connection to the host engine is not valid any longer
 DCGM_ST_GPU_NOT_SUPPORTED = -22  # This GPU is not supported by DCGM
-DCGM_ST_GROUP_INCOMPATIBLE = (
-    -23
-)  # The GPUs of the provided group are not compatible with each other for the requested operation
+DCGM_ST_GROUP_INCOMPATIBLE = -23  # The GPUs of the provided group are not compatible with each other for the requested operation
 DCGM_ST_MAX_LIMIT = -24
 DCGM_ST_LIBRARY_NOT_FOUND = -25  # DCGM library could not be found
 DCGM_ST_DUPLICATE_KEY = -26  # Duplicate key passed to the function
@@ -111,9 +109,7 @@
 DCGM_ST_MODULE_NOT_LOADED = (
     -33
 )  # This request is serviced by a module of DCGM that is not currently loaded
-DCGM_ST_IN_USE = (
-    -34
-)  # The requested operation could not be completed because the affected resource is in use
+DCGM_ST_IN_USE = -34  # The requested operation could not be completed because the affected resource is in use
 DCGM_ST_GROUP_IS_EMPTY = (
     -35
 )  # The specified group is empty and this operation is not valid with an empty group
@@ -126,9 +122,7 @@
 DCGM_ST_PROFILING_MULTI_PASS = (
     -38
 )  # The requested profiling metrics cannot be collected in a single pass
-DCGM_ST_DIAG_ALREADY_RUNNING = (
-    -39
-)  # A diag instance is already running, cannot run a new diag until the current one finishes.
+DCGM_ST_DIAG_ALREADY_RUNNING = -39  # A diag instance is already running, cannot run a new diag until the current one finishes.
 DCGM_ST_DIAG_BAD_JSON = (
     -40
 )  # The DCGM GPU Diagnostic returned JSON that cannot be parsed

diff --git a/torchbenchmark/_components/model_analyzer/dcgm/dcgm_value.py b/torchbenchmark/_components/model_analyzer/dcgm/dcgm_value.py
@@ -124,7 +124,6 @@ def __str__(self):
 
 ###############################################################################
 def self_test():
-
     v = DcgmValue(1.0)
     assert not v.IsBlank()
     assert v.value == 1.0

diff --git a/torchbenchmark/_components/model_analyzer/dcgm/nvml_monitor.py b/torchbenchmark/_components/model_analyzer/dcgm/nvml_monitor.py
@@ -2,8 +2,6 @@
 
 import pynvml
 
-from packaging import version
-
 from ..tb_dcgm_types.gpu_free_memory import GPUFreeMemory
 from ..tb_dcgm_types.gpu_peak_memory import GPUPeakMemory
 from ..tb_dcgm_types.gpu_power_usage import GPUPowerUsage
@@ -14,9 +12,7 @@
 
 
 class NVMLMonitor(Monitor):
-    """
-    Use NVML to monitor GPU metrics
-    """
+    """Use NVML to monitor GPU metrics."""
 
     # Mapping between the NVML Fields and Model Analyzer Records
     # For more explainations, please refer to https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
@@ -28,7 +24,8 @@ class NVMLMonitor(Monitor):
     }
 
     def __init__(self, gpus, frequency, metrics):
-        """
+        """Initialize the NVML monitor.
+
         Parameters
         ----------
         gpus : list of GPUDevice
@@ -48,24 +45,13 @@ def __init__(self, gpus, frequency, metrics):
         self._gpus = gpus
         # gpu handles: {gpu: handle}
         self._gpu_handles = {}
-        self._nvmlDeviceGetHandleByUUID = None
-        self.check_nvml_compatibility()
+        self._nvmlDeviceGetHandleByUUID = self._nvml.nvmlDeviceGetHandleByUUID
         for gpu in self._gpus:
             self._gpu_handles[gpu] = self._nvmlDeviceGetHandleByUUID(gpu.device_uuid())
             self._records[gpu] = {}
             for metric in self._metrics:
                 self._records[gpu][metric] = []
 
-    def check_nvml_compatibility(self):
-        # check pynvml version, if it is less than 11.5.0, convert uuid to bytes
-        current_version = version.parse(pynvml.__version__)
-        if current_version < version.parse("11.5.0"):
-            self._nvmlDeviceGetHandleByUUID = (
-                self._nvmlDeviceGetHandleByUUID_for_older_pynvml
-            )
-        else:
-            self._nvmlDeviceGetHandleByUUID = self._nvml.nvmlDeviceGetHandleByUUID
-
     def _nvmlDeviceGetHandleByUUID_for_older_pynvml(self, uuid):
         return self._nvml.nvmlDeviceGetHandleByUUID(uuid.encode("ascii"))
 

diff --git a/torchbenchmark/_components/model_analyzer/tb_dcgm_types/record_aggregator.py b/torchbenchmark/_components/model_analyzer/tb_dcgm_types/record_aggregator.py
@@ -193,9 +193,9 @@ def groupby_wo_aggregate(self, record_types, groupby_criterion):
                     record_types=[record_type],
                     filters=[lambda r: groupby_criterion(r) == field_value],
                 )
-                groupby_result[record_type][
-                    field_value
-                ] = temp_records_aggregator.get_records()
+                groupby_result[record_type][field_value] = (
+                    temp_records_aggregator.get_records()
+                )
         return groupby_result
 
     def record_types(self):
diff --git a/test.py b/test.py
@@ -55,7 +55,6 @@ def _create_example_model_instance(task: ModelTask, device: str):
 
 
 def _load_test(path, device):
-
     model_name = os.path.basename(path)
 
     def _skip_cuda_memory_check_p(metadata):