Skip to content

Commit

Permalink
Merge branch 'pytorch:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
juliagmt-google authored Dec 9, 2024
2 parents dfce935 + 692bf64 commit 7ef5572
Show file tree
Hide file tree
Showing 129 changed files with 319 additions and 365 deletions.
4 changes: 1 addition & 3 deletions .github/scripts/abtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,7 @@ def validate_results(a, b) -> bool:
args.pytorch_repo
).is_dir(), f"Specified PyTorch repo dir {args.pytorch_repo} doesn't exist."
commits = gitutils.get_git_commits(args.pytorch_repo, args.base, args.head)
assert (
commits
), f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
assert commits, f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
# setup cuda environment
cuda_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
result_a = run_commit(
Expand Down
24 changes: 9 additions & 15 deletions .github/scripts/bmutils/analyze-bisection-result.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,15 @@ def check_env(bisection_root: str):
assert (
bisection_path.is_dir()
), f"Specified bisection root {bisection_path} is not a directory."
assert bisection_path.joinpath(
"gh-issue.md"
).exists(), (
f"Bisection directory {bisection_path} doesn't contain file gh-issue.md."
)
assert bisection_path.joinpath(
"result.json"
).exists(), (
f"Bisection directory {bisection_path} doesn't contain file result.json."
)
assert bisection_path.joinpath(
"config.yaml"
).exists(), (
f"Bisection directory {bisection_path} doesn't contain file config.yaml."
)
assert (
bisection_path.joinpath("gh-issue.md").exists()
), f"Bisection directory {bisection_path} doesn't contain file gh-issue.md."
assert (
bisection_path.joinpath("result.json").exists()
), f"Bisection directory {bisection_path} doesn't contain file result.json."
assert (
bisection_path.joinpath("config.yaml").exists()
), f"Bisection directory {bisection_path} doesn't contain file config.yaml."


def setup_gh_issue(bisection_root: str, gh_workflow_id: str):
Expand Down
4 changes: 1 addition & 3 deletions .github/scripts/run-config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,7 @@ def get_models(config) -> Optional[str]:
r = re.compile(model_pattern)
matched_models = list(filter(lambda x: r.match(x), models))
enabled_models.extend(matched_models)
assert (
enabled_models
), f"The model patterns you specified {config['models']} does not match any model. Please double check."
assert enabled_models, f"The model patterns you specified {config['models']} does not match any model. Please double check."
return enabled_models


Expand Down
4 changes: 1 addition & 3 deletions .github/scripts/userbenchmark/aicluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,7 @@ def run_aicluster_benchmark(
index = get_metrics_index(s3, benchmark_name, work_dir)
# if the previous run is not successful, exit immediately
if check_success and not determine_success_today(index):
assert (
False
), f"Don't find the last successful run in index: { index }. Please report a bug."
assert False, f"Don't find the last successful run in index: { index }. Please report a bug."
# upload to scribe by the index
if upload_scribe:
upload_metrics_to_scribe(s3, benchmark_name, index, work_dir)
Expand Down
4 changes: 2 additions & 2 deletions bisection.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,8 +632,8 @@ def main() -> None:
if args.skip_update:
skip_update_repos = list(map(lambda x: x.strip(), args.skip_update.split(",")))
for repo in skip_update_repos:
assert repo in list(
TORCHBENCH_BISECTION_TARGETS.keys()
assert (
repo in list(TORCHBENCH_BISECTION_TARGETS.keys())
), f"User specified skip update repo {repo} not in list: {TORCHBENCH_BISECTION_TARGETS.keys()}"
else:
skip_update_repos = None
Expand Down
2 changes: 2 additions & 0 deletions install.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@
cmd.extend(["--skip"] + args.skip)
if args.canary:
cmd.extend(["--canary"])
if args.continue_on_fail:
cmd.extend(["--continue_on_fail"])
cmd.extend(extra_args)
if userbenchmark_dir.joinpath("install.py").is_file():
# add the current run env to PYTHONPATH to load framework install utils
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[build-system]
requires = ["setuptools", "wheel"]
# Use legacy backend to import local packages in setup.py
build-backend = "setuptools.build_meta:__legacy__"

Expand Down
4 changes: 1 addition & 3 deletions regression_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,7 @@ def get_metrics_by_date(
if metric_datetime.date() == pick_date.date():
pick_metrics_json_key = metrics_json_key
break
assert (
pick_metrics_json_key
), f"Selected date {pick_date} is not found in the latest_metrics_jsons: {latest_metrics_jsons}"
assert pick_metrics_json_key, f"Selected date {pick_date} is not found in the latest_metrics_jsons: {latest_metrics_jsons}"
s3 = S3Client(USERBENCHMARK_S3_BUCKET, USERBENCHMARK_S3_OBJECT)
metrics_json = s3.get_file_as_json(pick_metrics_json_key)
return (metrics_json, pick_metrics_json_key)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pyyaml
numpy
opencv-python
submitit
pynvml
pynvml>=12.0.0
pandas
scipy
numba
2 changes: 0 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,10 @@


def run_one_step_with_cudastreams(func, streamcount):

print("Running Utilization Scaling Using Cuda Streams")

streamlist = []
for i in range(1, streamcount + 1, 1):

# create additional streams and prime with load
while len(streamlist) < i:
s = torch.cuda.Stream()
Expand Down
4 changes: 1 addition & 3 deletions scripts/proper_bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,7 @@ def _run_model_test_proper_bs(
except NotImplementedError as e:
status = "NotImplemented"
error_message = str(e)
except (
TypeError
) as e: # TypeError is raised when the model doesn't support variable batch sizes
except TypeError as e: # TypeError is raised when the model doesn't support variable batch sizes
status = "TypeError"
error_message = str(e)
except KeyboardInterrupt as e:
Expand Down
1 change: 0 additions & 1 deletion scripts/upload_scribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def format_message(self, field_dict):
elif field in self.schema["float"]:
message["float"][field] = float(value)
else:

raise ValueError(
"Field {} is not currently used, "
"be intentional about adding new fields".format(field)
Expand Down
1 change: 0 additions & 1 deletion scripts/upload_scribe_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def format_message(self, field_dict):
elif field in self.schema["float"]:
message["float"][field] = float(value)
else:

raise ValueError(
"Field {} is not currently used, "
"be intentional about adding new fields".format(field)
Expand Down
1 change: 0 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def _create_example_model_instance(task: ModelTask, device: str):


def _load_test(path, device):

model_name = os.path.basename(path)

def _skip_cuda_memory_check_p(metadata):
Expand Down
1 change: 0 additions & 1 deletion test_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def pytest_generate_tests(metafunc):
group="hub",
)
class TestBenchNetwork:

def test_train(self, model_path, device, benchmark):
try:
model_name = os.path.basename(model_path)
Expand Down
1 change: 0 additions & 1 deletion torchbenchmark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,6 @@ def args(self) -> List[str]:


class ModelTask(base_task.TaskBase):

# The worker may (and often does) consume significant system resources.
# In order to ensure that runs do not interfere with each other, we only
# allow a single ModelTask to exist at a time.
Expand Down
4 changes: 3 additions & 1 deletion torchbenchmark/_components/_impl/workers/subprocess_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,10 @@ def write(self, msg: bytes) -> None:
def get_writer_pid(self) -> int:
assert (
self._writer_pid is not None
), "Writer pid is not specified. Maybe calling from child process or input pipe.\
), (
"Writer pid is not specified. Maybe calling from child process or input pipe.\
Please report a bug."
)
return self._writer_pid

def set_writer_pid(self, writer_pid: int) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,9 @@ def export_all_records_to_csv(self):
]
cluster_records.sort(key=lambda x: x.timestamp())
for record in cluster_records:
csv_records[gpu_uuid][record_type][
record.timestamp()
] = record.value()
csv_records[gpu_uuid][record_type][record.timestamp()] = (
record.value()
)
with open(self.export_csv_name, "w") as fout:
for gpu_uuid in csv_records:
# timestamp records in DCGM are in microseconds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def default(self, obj): # pylint: disable=E0202


def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, userData):

userData = ctypes.cast(userData, ctypes.py_object).value
userData._ProcessValues(gpuId, values[0:numValues])
return 0
Expand Down Expand Up @@ -363,7 +362,6 @@ def GetAllSinceLastCall(self):
def py_helper_dcgm_field_values_since_entity_callback(
entityGroupId, entityId, values, numValues, userData
):

userData = ctypes.cast(userData, ctypes.py_object).value
userData._ProcessValues(entityGroupId, entityId, values[0:numValues])
return 0
Expand All @@ -382,9 +380,7 @@ def py_helper_dcgm_field_values_since_entity_callback(

class DcgmFieldValueEntityCollection:
def __init__(self, handle, groupId):
self.values = (
{}
) # 3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
self.values = {} # 3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
self._handle = handle
self._groupId = groupId
self._numValuesSeen = 0
Expand All @@ -408,9 +404,9 @@ def _ProcessValues(self, entityGroupId, entityId, values):
value = DcgmFieldValue(rawValue)

if value.fieldId not in self.values[entityGroupId][entityId]:
self.values[entityGroupId][entityId][
value.fieldId
] = DcgmFieldValueTimeSeries()
self.values[entityGroupId][entityId][value.fieldId] = (
DcgmFieldValueTimeSeries()
)

self.values[entityGroupId][entityId][value.fieldId].InsertValue(value)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def _collect_records(self):
for metric_type in self._metrics:
dcgm_field = self.model_analyzer_to_dcgm_field[metric_type]
for measurement in metrics[dcgm_field].values:

if measurement.value is not None:
# DCGM timestamp is in nanoseconds
records.append(
Expand Down
12 changes: 3 additions & 9 deletions torchbenchmark/_components/model_analyzer/dcgm/dcgm_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,7 @@
-21
) # Connection to the host engine is not valid any longer
DCGM_ST_GPU_NOT_SUPPORTED = -22 # This GPU is not supported by DCGM
DCGM_ST_GROUP_INCOMPATIBLE = (
-23
) # The GPUs of the provided group are not compatible with each other for the requested operation
DCGM_ST_GROUP_INCOMPATIBLE = -23 # The GPUs of the provided group are not compatible with each other for the requested operation
DCGM_ST_MAX_LIMIT = -24
DCGM_ST_LIBRARY_NOT_FOUND = -25 # DCGM library could not be found
DCGM_ST_DUPLICATE_KEY = -26 # Duplicate key passed to the function
Expand All @@ -111,9 +109,7 @@
DCGM_ST_MODULE_NOT_LOADED = (
-33
) # This request is serviced by a module of DCGM that is not currently loaded
DCGM_ST_IN_USE = (
-34
) # The requested operation could not be completed because the affected resource is in use
DCGM_ST_IN_USE = -34 # The requested operation could not be completed because the affected resource is in use
DCGM_ST_GROUP_IS_EMPTY = (
-35
) # The specified group is empty and this operation is not valid with an empty group
Expand All @@ -126,9 +122,7 @@
DCGM_ST_PROFILING_MULTI_PASS = (
-38
) # The requested profiling metrics cannot be collected in a single pass
DCGM_ST_DIAG_ALREADY_RUNNING = (
-39
) # A diag instance is already running, cannot run a new diag until the current one finishes.
DCGM_ST_DIAG_ALREADY_RUNNING = -39 # A diag instance is already running, cannot run a new diag until the current one finishes.
DCGM_ST_DIAG_BAD_JSON = (
-40
) # The DCGM GPU Diagnostic returned JSON that cannot be parsed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ def __str__(self):

###############################################################################
def self_test():

v = DcgmValue(1.0)
assert not v.IsBlank()
assert v.value == 1.0
Expand Down
22 changes: 4 additions & 18 deletions torchbenchmark/_components/model_analyzer/dcgm/nvml_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import pynvml

from packaging import version

from ..tb_dcgm_types.gpu_free_memory import GPUFreeMemory
from ..tb_dcgm_types.gpu_peak_memory import GPUPeakMemory
from ..tb_dcgm_types.gpu_power_usage import GPUPowerUsage
Expand All @@ -14,9 +12,7 @@


class NVMLMonitor(Monitor):
"""
Use NVML to monitor GPU metrics
"""
"""Use NVML to monitor GPU metrics."""

# Mapping between the NVML Fields and Model Analyzer Records
# For more explanations, please refer to https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
Expand All @@ -28,7 +24,8 @@ class NVMLMonitor(Monitor):
}

def __init__(self, gpus, frequency, metrics):
"""
"""Initialize the NVML monitor.
Parameters
----------
gpus : list of GPUDevice
Expand All @@ -48,24 +45,13 @@ def __init__(self, gpus, frequency, metrics):
self._gpus = gpus
# gpu handles: {gpu: handle}
self._gpu_handles = {}
self._nvmlDeviceGetHandleByUUID = None
self.check_nvml_compatibility()
self._nvmlDeviceGetHandleByUUID = self._nvml.nvmlDeviceGetHandleByUUID
for gpu in self._gpus:
self._gpu_handles[gpu] = self._nvmlDeviceGetHandleByUUID(gpu.device_uuid())
self._records[gpu] = {}
for metric in self._metrics:
self._records[gpu][metric] = []

def check_nvml_compatibility(self):
# check pynvml version, if it is less than 11.5.0, convert uuid to bytes
current_version = version.parse(pynvml.__version__)
if current_version < version.parse("11.5.0"):
self._nvmlDeviceGetHandleByUUID = (
self._nvmlDeviceGetHandleByUUID_for_older_pynvml
)
else:
self._nvmlDeviceGetHandleByUUID = self._nvml.nvmlDeviceGetHandleByUUID

def _nvmlDeviceGetHandleByUUID_for_older_pynvml(self, uuid):
return self._nvml.nvmlDeviceGetHandleByUUID(uuid.encode("ascii"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ def groupby_wo_aggregate(self, record_types, groupby_criterion):
record_types=[record_type],
filters=[lambda r: groupby_criterion(r) == field_value],
)
groupby_result[record_type][
field_value
] = temp_records_aggregator.get_records()
groupby_result[record_type][field_value] = (
temp_records_aggregator.get_records()
)
return groupby_result

def record_types(self):
Expand Down
Loading

0 comments on commit 7ef5572

Please sign in to comment.