From d4a51e4102c6304279691c384605ee4c490c206d Mon Sep 17 00:00:00 2001 From: "Nagaraj, Sriraksha" Date: Tue, 4 Feb 2025 04:05:38 -0600 Subject: [PATCH] Adding att v3 support (#84) * Adding att v3 support * misc fix * bug fix * Python linting workflow and rules * fix regex * Adding temporary args * fix temporary args * fix format * remove att_perfcounters from test input * Review comments (#163) Co-authored-by: Giovanni Baraldi * Revert "Review comments (#163)" This reverts commit 9ef0f8e5a4489d5581255e1b70ced2aef5c1c1d0. * Address review comments 2 * review changes * review comments * review * cmake alias * review * review * review * review * Enabling percounter in v3 script * review * formatting * formatting --------- Co-authored-by: Jonathan R. Madsen Co-authored-by: Baraldi, Giovanni Co-authored-by: Giovanni Baraldi --- .flake8 | 16 + .github/workflows/python.yml | 39 ++ requirements.txt | 1 + source/bin/rocprofv3.py | 274 +++++++++- source/bin/rocprofv3_avail.py | 1 - source/docs/how-to/using-rocprofv3.rst | 5 + source/docs/rocprofv3-schema.json | 18 + source/lib/output/generateJSON.cpp | 22 +- source/lib/output/metadata.cpp | 32 ++ source/lib/output/metadata.hpp | 28 +- source/lib/rocprofiler-sdk-att/CMakeLists.txt | 1 + .../rocprofiler-sdk-att/att_lib_wrapper.cpp | 12 +- source/lib/rocprofiler-sdk-att/dl.cpp | 30 +- source/lib/rocprofiler-sdk-att/dl.hpp | 2 +- .../rocprofiler-sdk-att/tests/CMakeLists.txt | 11 +- .../waitcnt/tests/CMakeLists.txt | 2 +- .../lib/rocprofiler-sdk-tool/CMakeLists.txt | 3 +- source/lib/rocprofiler-sdk-tool/config.cpp | 64 +++ source/lib/rocprofiler-sdk-tool/config.hpp | 20 +- source/lib/rocprofiler-sdk-tool/tool.cpp | 234 ++++++++- tests/bin/CMakeLists.txt | 1 + tests/bin/hsa-code-object/CMakeLists.txt | 90 ++++ tests/bin/hsa-code-object/copy.cl | 32 ++ tests/bin/hsa-code-object/copy_memory.cl | 32 ++ .../hsa-code-object/hsa_code_object_app.cpp | 492 ++++++++++++++++++ .../bin/hsa-code-object/hsa_code_object_app.h | 415 +++++++++++++++ tests/rocprofv3/CMakeLists.txt | 3 + .../advanced-thread-trace/CMakeLists.txt | 100 ++++ .../advanced-thread-trace/att_input.json | 20 + .../advanced-thread-trace/conftest.py | 59 +++ .../advanced-thread-trace/pytest.ini | 5 + .../advanced-thread-trace/validate.py | 47 ++ 32 files changed, 2068 insertions(+), 43 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/python.yml create mode 100644 tests/bin/hsa-code-object/CMakeLists.txt create mode 100644 tests/bin/hsa-code-object/copy.cl create mode 100644 tests/bin/hsa-code-object/copy_memory.cl create mode 100644 tests/bin/hsa-code-object/hsa_code_object_app.cpp create mode 100644 tests/bin/hsa-code-object/hsa_code_object_app.h create mode 100644 tests/rocprofv3/advanced-thread-trace/CMakeLists.txt create mode 100644 tests/rocprofv3/advanced-thread-trace/att_input.json create mode 100644 tests/rocprofv3/advanced-thread-trace/conftest.py create mode 100644 tests/rocprofv3/advanced-thread-trace/pytest.ini create mode 100644 tests/rocprofv3/advanced-thread-trace/validate.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..0260af02 --- /dev/null +++ b/.flake8 @@ -0,0 +1,16 @@ +[flake8] +max-line-length = + 90 +ignore = + E203, + E501, + W503, + W605, +exclude = + .git, + __pycache__, + scripts, + external, + tests, + build, + build-* diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 00000000..17675bf9 --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,39 @@ +name: Python Linting + +on: + workflow_dispatch: + push: + branches: [ amd-staging, amd-mainline ] + paths: + - '**.py' + pull_request: + paths: + - '**.py' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + linting: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.10', '3.12'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 source --count --show-source --statistics --select=E9,F63,F7,F82 + # flake8 options are defined in .flake8 + flake8 source --count --show-source --statistics diff --git a/requirements.txt b/requirements.txt index 28f19b95..178bc1fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ clang-tidy>=15.0.0,<19.0.0 cmake>=3.21.0 cmake-format dataclasses +flake8 numpy otf2 pandas diff --git a/source/bin/rocprofv3.py b/source/bin/rocprofv3.py index f35fa9fb..9ef5f62f 100755 --- a/source/bin/rocprofv3.py +++ b/source/bin/rocprofv3.py @@ -26,6 +26,7 @@ import sys import argparse import subprocess +import re class dotdict(dict): @@ -87,6 +88,95 @@ def strtobool(val): raise ValueError(f"invalid truth value {val} (type={val_type})") +def search_path(path_list): + supported_option = [] + lib_att_pattern = r"libatt_decoder_(trace|summary|debug|testing)\.so" + file_list = [] + + for path in path_list: + for root, dirs, files in os.walk(path, topdown=True): + file_list.extend(files) + break + for itr in file_list: + _match = re.match(lib_att_pattern, itr) + if _match: + lst = re.findall("trace|debug|summary|testing", itr) + supported_option.extend(lst) + return set(supported_option) + + +def check_att_capability(args): + + path = [] + ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__)) + ROCM_DIR = os.path.dirname(ROCPROFV3_DIR) + support_input = {} + tmp_parser = argparse.ArgumentParser(add_help=False) + tmp_parser.add_argument( + "--att-library-path", + default=os.environ.get( + "ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None) + ), + type=str, + required=False, + ) + + tmp_parser.add_argument( + "-i", + "--input", + default=None, + type=str, + required=False, + ) + + tmp_data = {} + att_args, unparsed_args = tmp_parser.parse_known_args(args) + tmp_keys = list(att_args.__dict__.keys()) + + for itr in tmp_keys: + if has_set_attr(att_args, itr): + tmp_data[itr] = getattr(att_args, itr) + + data = dotdict(tmp_data) + if data.input: + # If index of a pass in input file is a key in the support_input dict, then that pass has att-library-path arg + args_list = parse_input(data.input) + for index, itr in enumerate(args_list): + if itr.att_library_path: + library_path = [] + if ":" in itr.att_library_path: + library_path.extend(itr.att_library_path.split(":")) + else: + library_path.append(itr.att_library_path) + support = search_path(library_path) + # If the att-library-path in the input file for a pass is valid, then the value of index key in the dict, support_input, is updated to that valid path + if support: + support_input[index] = set(support) + else: + # If the att-library-path in the input file for a pass is invalid, then the value of index key in the dict, support_input, is empty + support_input[index] = [] + if data.att_library_path: + if ":" in data.att_library_path: + path.extend(data.att_library_path.split(":")) + else: + path.append(data.att_library_path) + else: + path.append(f"{ROCM_DIR}/lib") + path.append(f"{ROCM_DIR}/lib64") + + support = search_path(set(path)) + if support: + if len(path) == 1: + os.environ["ATT_LIBRARY_PATH"] = path[0] + os.environ["ROCPROF_ATT_LIBRARY_PATH"] = path[0] + else: + os.environ["ATT_LIBRARY_PATH"] = ":".join(path) + os.environ["ROCPROF_ATT_LIBRARY_PATH"] = ":".join(path) + return support, support_input + + return None, support_input + + class booleanArgAction(argparse.Action): def __call__(self, parser, args, value, option_string=None): setattr(args, self.dest, strtobool(value)) @@ -473,6 +563,14 @@ def add_parser_bool_argument(gparser, *args, **kwargs): default=os.environ.get("ROCPROF_PRELOAD", "").split(":"), nargs="*", ) + + advanced_options.add_argument( + "--att-library-path", + default=os.environ.get( + "ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None) + ), + help="ATT library path to find decoder library", + ) # below is available for CI because LD_PRELOADing a library linked to a sanitizer library # causes issues in apps where HIP is part of shared library. add_parser_bool_argument( @@ -494,7 +592,66 @@ def add_parser_bool_argument(gparser, *args, **kwargs): app_args = args[(idx + 1) :] break - return (parser.parse_args(rocp_args), app_args) + supported_list, is_support_input = check_att_capability(rocp_args) + if supported_list or len(is_support_input) != 0: + choice_list = [] + for keys, values in is_support_input.items(): + choice_list.extend(values) + if supported_list: + choice_list.extend(list(supported_list)) + + att_options = parser.add_argument_group("Advanced Thread Trace") + + add_parser_bool_argument( + att_options, + "--advanced-thread-trace", + help="Enable ATT", + ) + + att_options.add_argument( + "--att-target-cu", + help="ATT target compute unit", + default=None, + type=int, + ) + + att_options.add_argument( + "--att-simd-select", + help="Select ATT SIMD", + default=None, + type=str, + ) + + att_options.add_argument( + "--att-buffer-size", + help="Buffer Size", + default=None, + type=str, + ) + + att_options.add_argument( + "--att-shader-engine-mask", + help="att shader engine mask", + default=None, + type=str, + ) + + att_options.add_argument( + "--att-parse", + type=str.lower, + default=None, + help="Select ATT Parse method from the choices", + choices=set(choice_list), + ) + + add_parser_bool_argument( + att_options, + "--att-serialize-all", + default=False, + help="Serialize all kernels", + ) + + return (parser.parse_args(rocp_args), app_args, supported_list, is_support_input) def parse_yaml(yaml_file): @@ -1048,6 +1205,77 @@ def log_config(_env): update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method) update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval) + if args.advanced_thread_trace: + + def int_auto(num_str): + if "0x" in num_str: + return int(num_str, 16) + else: + return int(num_str, 10) + + if args.pmc or ( + args.pc_sampling_beta_enabled + or args.pc_sampling_unit + or args.pc_sampling_method + or args.pc_sampling_interval + ): + fatal_error( + "Advanced thread trace cannot be enabled with counter collection or pc sampling" + ) + + if not args.att_parse: + fatal_error("provide the parser choice") + + update_env("ROCPROF_ADVANCED_THREAD_TRACE", True, overwrite=True) + update_env("ROCPROF_ATT_CAPABILITY", args.att_parse, overwrite=True) + + if args.att_target_cu: + update_env("ROCPROF_ATT_PARAM_TARGET_CU", args.att_target_cu, overwrite=True) + + if args.att_shader_engine_mask: + update_env( + "ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", + int_auto(args.att_shader_engine_mask), + overwrite=True, + ) + if args.att_buffer_size: + update_env( + "ROCPROF_ATT_PARAM_BUFFER_SIZE", + int_auto(args.att_buffer_size), + overwrite=True, + ) + if args.att_simd_select: + update_env( + "ROCPROF_ATT_PARAM_SIMD_SELECT", + int_auto(args.att_simd_select), + overwrite=True, + ) + if args.att_serialize_all: + update_env( + "ROCPROF_ATT_PARAM_SERIALIZE_ALL", + args.att_serialize_all, + overwrite=True, + ) + + if args.att_library_path: + + update_env( + "ROCPROF_ATT_LIBRARY_PATH", + args.att_library_path, + overwrite=True, + ) + update_env( + "ATT_LIBRARY_PATH", + args.att_library_path, + overwrite=True, + ) + if args.att_percounters: + update_env( + "ROCPROF_ATT_PARAM_PERFCOUNTERS", + " ".join(args.att_perfcounters), + overwrite=True, + ) + if use_execv: # does not return os.execvpe(app_args[0], app_args, env=app_env) @@ -1061,9 +1289,45 @@ def log_config(_env): return exit_code +def check_att_path_parse_method(args, index, support_att_input, att_parse_supported): + + if not att_parse_supported: + if index not in support_att_input.keys(): + fatal_error( + f"Advanced_thread_trace enabled but no decoder library found in cmdline/env paths and att_library_path not set for pass-{index + 1}" + ) + elif not support_att_input[index]: + fatal_error( + f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}" + ) + else: + if args.att_parse and args.att_parse not in support_att_input[index]: + fatal_error( + f"Advanced_thread_trace enabled but decoder library for requested parse method not found in att_library_path for pass-{index + 1}" + ) + else: + if index in support_att_input.keys() and not support_att_input[index]: + fatal_error( + f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}" + ) + + elif index not in support_att_input.keys(): + if args.att_parse and args.att_parse not in att_parse_supported: + fatal_error( + "Advanced_thread_trace enabled but decoder library for requested parse method not found" + ) + else: + if args.att_parse and args.att_parse not in support_att_input[index]: + fatal_error( + f"Advanced_thread_trace enabled but decoder library for requested parse method not found for pass-{index + 1}" + ) + + def main(argv=None): - cmd_args, app_args = parse_arguments(argv) + # att_parse_supported is valid path for decoder in env or commandline arg + # support_att_input is a dict, where key is a pass index with value being a valid decoder path + cmd_args, app_args, att_parse_supported, support_att_input = parse_arguments(argv) inp_args = ( parse_input(cmd_args.input) if getattr(cmd_args, "input") else [dotdict({})] ) @@ -1073,10 +1337,16 @@ def main(argv=None): pass_idx = None if has_set_attr(args, "pmc") and len(args.pmc) > 0: pass_idx = 1 + if args.advanced_thread_trace: + check_att_path_parse_method(args, 0, support_att_input, att_parse_supported) run(app_args, args, pass_id=pass_idx) else: for idx, itr in enumerate(inp_args): args = get_args(cmd_args, itr) + if args.advanced_thread_trace: + check_att_path_parse_method( + args, idx, support_att_input, att_parse_supported + ) run( app_args, args, diff --git a/source/bin/rocprofv3_avail.py b/source/bin/rocprofv3_avail.py index 5911a67a..98b6f8ed 100644 --- a/source/bin/rocprofv3_avail.py +++ b/source/bin/rocprofv3_avail.py @@ -23,7 +23,6 @@ # THE SOFTWARE. import ctypes -import pathlib import os import io import csv diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index 3947bf1e..62ef041c 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -805,6 +805,11 @@ Properties - **``pc_sampling_interval``** *(integer)*: pc sampling interval. - **``pc-sampling-beta-enabled``** *(boolean)*: enable pc sampling support; beta version. + - **``att_filenames``** *(object)* + - **``key``** *(integer)*: Dispatch id. + - **``value``** *(array)*: An array of ATT filenames. + - **``code_object_snapshot_filenames``** *(array)*: Code + object snapshot filename. .. code-block:: shell diff --git a/source/docs/rocprofv3-schema.json b/source/docs/rocprofv3-schema.json index 012e64f2..581e13fd 100644 --- a/source/docs/rocprofv3-schema.json +++ b/source/docs/rocprofv3-schema.json @@ -567,6 +567,24 @@ "pc_sample_comments": { "type": "array", "description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string." + }, + "att_filenames" : { + "type": "object", + "properties": { + "key": { + "type": "integer", + "description": "Dispatch id." + }, + "value": { + "type": "array", + "description": "An array of ATT filenames." + } + + } + }, + "code_object_snapshot_filenames": { + "type": "array", + "description": "Code object snapshot filename" } } }, diff --git a/source/lib/output/generateJSON.cpp b/source/lib/output/generateJSON.cpp index 171f2a41..6b034d30 100644 --- a/source/lib/output/generateJSON.cpp +++ b/source/lib/output/generateJSON.cpp @@ -25,6 +25,7 @@ #include "statistics.hpp" #include "timestamps.hpp" +#include "lib/common/filesystem.hpp" #include "lib/common/string_entry.hpp" #include "lib/common/utility.hpp" @@ -37,6 +38,7 @@ namespace rocprofiler { namespace tool { +namespace fs = common::filesystem; json_output::json_output(const output_config& cfg, std::string_view filename, JSONOutputArchive::Options _opts) @@ -115,11 +117,18 @@ write_json(json_output& json_ar, json_ar(cereal::make_nvp("counters", tool_metadata.get_counter_info())); { - auto callback_name_info = tool_metadata.callback_names; - auto buffer_name_info = tool_metadata.buffer_names; - auto counter_dims = tool_metadata.get_counter_dimension_info(); - auto marker_msg_data = tool_metadata.marker_messages.get(); - + auto callback_name_info = tool_metadata.callback_names; + auto buffer_name_info = tool_metadata.buffer_names; + auto counter_dims = tool_metadata.get_counter_dimension_info(); + auto marker_msg_data = tool_metadata.marker_messages.get(); + auto code_object_load_info = tool_metadata.get_code_object_load_info(); + auto att_filenames = tool_metadata.get_att_filenames(); + auto code_object_snapshot_filenames = std::vector{}; + code_object_snapshot_filenames.reserve(code_object_load_info.size()); + for(auto info : code_object_load_info) + { + code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename()); + } json_ar.setNextName("strings"); json_ar.startNode(); json_ar(cereal::make_nvp("callback_records", callback_name_info)); @@ -131,7 +140,8 @@ write_json(json_output& json_ar, json_ar( cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); - + json_ar(cereal::make_nvp("att_filenames", att_filenames)); + json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames)); { auto _extern_corr_id_strings = std::map{}; if(cfg.kernel_rename) diff --git a/source/lib/output/metadata.cpp b/source/lib/output/metadata.cpp index b7dcb54a..f1d2a835 100644 --- a/source/lib/output/metadata.cpp +++ b/source/lib/output/metadata.cpp @@ -22,12 +22,15 @@ #include "metadata.hpp" +#include "lib/common/filesystem.hpp" #include "lib/common/string_entry.hpp" #include "lib/output/agent_info.hpp" #include "lib/output/host_symbol_info.hpp" #include "lib/output/kernel_symbol_info.hpp" +#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp" #include +#include #include #include @@ -36,6 +39,7 @@ namespace rocprofiler { namespace tool { +namespace fs = common::filesystem; namespace { rocprofiler_status_t @@ -218,6 +222,34 @@ metadata::get_code_object(uint64_t code_obj_id) const }); } +code_object_load_info_vec_t +metadata::get_code_object_load_info() const +{ + auto _data = code_object_load.rlock([](const auto& _data_v) { + auto _info = std::vector{}; + _info.reserve(_data_v.size()); + for(const auto& itr : _data_v) + _info.emplace_back(itr); + return _info; + }); + + return _data; +} + +std::vector +metadata::get_att_filenames() const +{ + auto data = std::vector{}; + for(auto filenames : att_filenames) + { + for(auto file : filenames.second.second) + { + data.emplace_back(fs::path(file).filename()); + } + } + return data; +} + const kernel_symbol_info* metadata::get_kernel_symbol(uint64_t kernel_id) const { diff --git a/source/lib/output/metadata.hpp b/source/lib/output/metadata.hpp index 66ac7586..5182f4fa 100644 --- a/source/lib/output/metadata.hpp +++ b/source/lib/output/metadata.hpp @@ -32,6 +32,7 @@ #include "lib/common/demangle.hpp" #include "lib/common/logging.hpp" #include "lib/common/synchronized.hpp" +#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp" #include #include @@ -74,8 +75,11 @@ using marker_message_ordered_map_t = std::map; using string_entry_map_t = std::unordered_map>; using counter_dimension_vec_t = std::vector; using external_corr_id_set_t = std::unordered_set; -using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; -using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction; +using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction; +using att_agent_filenames_t = std::pair>; +using att_filenames_map_t = std::unordered_map; +using code_object_load_info_vec_t = std::vector; template using synced_map = common::Synchronized; @@ -94,14 +98,16 @@ struct metadata agent_counter_info_map_t agent_counter_info = {}; agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {}; - sdk::buffer_name_info buffer_names = {}; - sdk::callback_name_info callback_names = {}; - synced_map code_objects = {}; - synced_map kernel_symbols = {}; - synced_map marker_messages = {}; - synced_map string_entries = {}; - synced_map external_corr_ids = {}; - synced_map host_functions = {}; + sdk::buffer_name_info buffer_names = {}; + sdk::callback_name_info callback_names = {}; + synced_map code_objects = {}; + synced_map kernel_symbols = {}; + synced_map marker_messages = {}; + synced_map string_entries = {}; + synced_map external_corr_ids = {}; + synced_map host_functions = {}; + synced_map code_object_load = {}; + att_filenames_map_t att_filenames = {}; metadata() = default; metadata(inprocess); @@ -122,6 +128,7 @@ struct metadata const tool_counter_info* get_counter_info(rocprofiler_counter_id_t id) const; const counter_dimension_info_vec_t* get_counter_dimension_info(uint64_t instance_id) const; + std::vector get_att_filenames() const; code_object_data_vec_t get_code_objects() const; kernel_symbol_data_vec_t get_kernel_symbols() const; host_function_data_vec_t get_host_symbols() const; @@ -135,6 +142,7 @@ struct metadata std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); } int64_t get_instruction_index(rocprofiler_pc_t record); void add_decoder(rocprofiler_code_object_info_t* obj_data_v); + code_object_load_info_vec_t get_code_object_load_info() const; template Tp get_marker_messages(Tp&&); diff --git a/source/lib/rocprofiler-sdk-att/CMakeLists.txt b/source/lib/rocprofiler-sdk-att/CMakeLists.txt index 9066827f..5126235d 100755 --- a/source/lib/rocprofiler-sdk-att/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-att/CMakeLists.txt @@ -29,6 +29,7 @@ find_package( lib/cmake/amd_comgr) add_library(rocprofiler-sdk-att-parser STATIC) +add_library(rocprofiler-sdk::rocprofiler-sdk-att-parser ALIAS rocprofiler-sdk-att-parser) target_sources(rocprofiler-sdk-att-parser PRIVATE ${ATT_TOOL_SOURCE_FILES}) target_link_libraries( diff --git a/source/lib/rocprofiler-sdk-att/att_lib_wrapper.cpp b/source/lib/rocprofiler-sdk-att/att_lib_wrapper.cpp index d27031ca..7c1e3416 100644 --- a/source/lib/rocprofiler-sdk-att/att_lib_wrapper.cpp +++ b/source/lib/rocprofiler-sdk-att/att_lib_wrapper.cpp @@ -86,8 +86,8 @@ get_shader_id(const std::string& name) auto run_pos = name.rfind('_'); if(run_pos == std::string::npos) throw std::runtime_error("Invalid name"); - std::string_view stripped = name.substr(0, run_pos); - auto se_number_pos = stripped.rfind('_'); + std::string stripped = name.substr(0, run_pos); + auto se_number_pos = stripped.rfind('_'); if(se_number_pos == std::string::npos || se_number_pos + 1 >= stripped.size()) throw std::runtime_error("Invalid name"); @@ -97,15 +97,11 @@ get_shader_id(const std::string& name) std::vector query_att_decode_capability() { - std::vector ret; + auto ret = std::vector{}; for(auto& [cap, libname] : get_lib_names()) { - if(auto handle = dlopen(libname, RTLD_NOW | RTLD_LOCAL)) - { - dlclose(handle); - ret.push_back(cap); - } + if(DL(libname).handle != 0) ret.push_back(cap); } return ret; diff --git a/source/lib/rocprofiler-sdk-att/dl.cpp b/source/lib/rocprofiler-sdk-att/dl.cpp index f08d31ed..9b6a0d2c 100644 --- a/source/lib/rocprofiler-sdk-att/dl.cpp +++ b/source/lib/rocprofiler-sdk-att/dl.cpp @@ -20,12 +20,19 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "dl.hpp" +#include "lib/rocprofiler-sdk-att/dl.hpp" +#include "lib/common/environment.hpp" +#include "lib/common/filesystem.hpp" +#include "lib/common/logging.hpp" + +#include + #include #include #include #include #include +#include #include #include @@ -33,9 +40,26 @@ namespace rocprofiler { namespace att_wrapper { -DL::DL(const char* dlname) +namespace fs = ::rocprofiler::common::filesystem; + +fs::path +get_search_path(std::string path_name) { - handle = dlopen(dlname, RTLD_NOW | RTLD_LOCAL); + if(fs::exists(path_name)) return fs::path(path_name); + return ""; +} + +DL::DL(const char* libname) +{ + auto paths = rocprofiler::common::get_env("ROCPROF_ATT_LIBRARY_PATH", ""); + if(paths.empty()) return; + auto path_set = rocprofiler::sdk::parse::tokenize(paths, ":"); + + for(auto&& name : path_set) + { + handle = dlopen((get_search_path(name) / libname).string().c_str(), RTLD_LAZY | RTLD_LOCAL); + if(handle) break; + } if(!handle) return; att_parse_data_fn = diff --git a/source/lib/rocprofiler-sdk-att/dl.hpp b/source/lib/rocprofiler-sdk-att/dl.hpp index e1d9a94b..e805a11d 100644 --- a/source/lib/rocprofiler-sdk-att/dl.hpp +++ b/source/lib/rocprofiler-sdk-att/dl.hpp @@ -35,7 +35,7 @@ class DL using StatusFn = decltype(rocprofiler_att_decoder_get_status_string); public: - DL(const char* dlname); + DL(const char* libname); ~DL(); ParseFn* att_parse_data_fn = nullptr; diff --git a/source/lib/rocprofiler-sdk-att/tests/CMakeLists.txt b/source/lib/rocprofiler-sdk-att/tests/CMakeLists.txt index 832dc851..68a0ad56 100644 --- a/source/lib/rocprofiler-sdk-att/tests/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-att/tests/CMakeLists.txt @@ -6,14 +6,15 @@ project(rocprofiler-att-parser-tests LANGUAGES CXX) add_executable(att-parser-tool-v3) target_link_libraries( att-parser-tool-v3 - PRIVATE rocprofiler-sdk-att-parser rocprofiler-sdk::rocprofiler-sdk-json + PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser + rocprofiler-sdk::rocprofiler-sdk-json rocprofiler-sdk::rocprofiler-sdk-common-library) target_sources(att-parser-tool-v3 PRIVATE standalone_tool_main.cpp) add_executable(att-decoder-test) target_link_libraries( att-decoder-test - PRIVATE rocprofiler-sdk-att-parser + PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser rocprofiler-sdk::rocprofiler-sdk-json rocprofiler-sdk::rocprofiler-sdk-common-library rocprofiler-sdk::rocprofiler-sdk-glog @@ -22,6 +23,7 @@ target_link_libraries( GTest::gtest_main) target_sources(att-decoder-test PRIVATE att_decoder_test.cpp) +set(env-att-lib "ROCPROF_ATT_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/../lib") add_library(att_decoder_testing SHARED) target_sources(att_decoder_testing PRIVATE dummy_decoder.cpp) @@ -31,6 +33,7 @@ if(NOT ROCPROFILER_MEMCHECK) SOURCES att_decoder_test.cpp TEST_LIST att-decoder-test_TESTS WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set_tests_properties(${att-decoder-test_TESTS} PROPERTIES TIMEOUT 10 LABELS - "unittests") + set_tests_properties( + ${att-decoder-test_TESTS} PROPERTIES ENVIRONMENT "${env-att-lib}" TIMEOUT 10 + LABELS "unittests") endif() diff --git a/source/lib/rocprofiler-sdk-att/waitcnt/tests/CMakeLists.txt b/source/lib/rocprofiler-sdk-att/waitcnt/tests/CMakeLists.txt index 372e8786..b2958be3 100644 --- a/source/lib/rocprofiler-sdk-att/waitcnt/tests/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-att/waitcnt/tests/CMakeLists.txt @@ -6,7 +6,7 @@ project(rocprofiler-att-parser-waitcnt-tests LANGUAGES CXX) add_executable(att-decoder-waitcnt-test) target_link_libraries( att-decoder-waitcnt-test - PRIVATE rocprofiler-sdk-att-parser + PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser rocprofiler-sdk::rocprofiler-sdk-json rocprofiler-sdk::rocprofiler-sdk-common-library rocprofiler-sdk::rocprofiler-sdk-glog diff --git a/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index 264eb310..a22e7b90 100644 --- a/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -23,7 +23,8 @@ target_link_libraries( rocprofiler-sdk::rocprofiler-sdk-perfetto rocprofiler-sdk::rocprofiler-sdk-otf2 rocprofiler-sdk::rocprofiler-sdk-dw - rocprofiler-sdk::rocprofiler-sdk-amd-comgr) + rocprofiler-sdk::rocprofiler-sdk-amd-comgr + rocprofiler-sdk::rocprofiler-sdk-att-parser) set_target_properties( rocprofiler-sdk-tool diff --git a/source/lib/rocprofiler-sdk-tool/config.cpp b/source/lib/rocprofiler-sdk-tool/config.cpp index ba43c49e..c3774619 100644 --- a/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/source/lib/rocprofiler-sdk-tool/config.cpp @@ -144,6 +144,68 @@ get_kernel_filter_range(const std::string& kernel_filter) return range_set; } +std::vector +parse_att_counters(std::string line) +{ + auto counters = std::vector{}; + + if(line.empty()) return counters; + + // strip the comment + if(auto pos = line.find('#'); pos != std::string::npos) line = line.substr(0, pos); + + // trim line for any white spaces after comment strip + trim(line); + + // check to see if comment stripping + trim resulted in empty line + if(line.empty()) return counters; + + handle_special_chars(line); + + auto extract_counter_name_and_simd_mask = [](std::string& input) { + std::string counter_name = ""; + auto ret = att_perfcounter{}; + + size_t pos = input.find(':'); + + if(pos != std::string::npos) + { + ret.counter_name = input.substr(0, pos); + ret.simd_mask = std::stoi(input.substr(pos + 1), nullptr, 16); + } + else + counter_name = input; + return ret; + }; + + // regex to check if string is of the form "counter_name:simd_mask" + std::regex pattern(R"([a-zA-Z0-9_]+(:0x[0-9a-fA-F]+)?)"); + std::set unique_counters; + + auto input_ss = std::stringstream{line}; + while(true) + { + auto counter = std::string{}; + input_ss >> counter; + if(counter.empty()) break; + + // check if the counter string matches the pattern + if(!std::regex_match(counter, pattern)) + { + ROCP_FATAL << "Invalid counter format for ATT: " << counter + << ". Expected format : Counter_name:optional_simd_mask(hexadecimal)"; + } + + // Consider only those counters where combination of counter name and simd mask is unique + if(unique_counters.insert(counter).second == false) continue; + + auto res = extract_counter_name_and_simd_mask(counter); + counters.emplace_back(res); + } + + return counters; +} + std::set parse_counters(std::string line) { @@ -193,6 +255,8 @@ config::config() , kernel_filter_range{get_kernel_filter_range( get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))} , counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))} +, att_param_perfcounters{ + parse_att_counters(get_env("ROCPROF_ATT_PARAM_PERFCOUNTERS", std::string{}))} { if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"}; diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index 0a6aaa5d..989f4c73 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -29,6 +29,7 @@ #include "lib/common/units.hpp" #include "lib/output/format_path.hpp" #include "lib/output/output_config.hpp" +#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp" #include @@ -63,6 +64,12 @@ get_config(); std::string format_name(std::string_view _name, const config& = get_config<>()); +struct att_perfcounter +{ + std::string counter_name = {}; + uint32_t simd_mask = 0xf; +}; + struct config : output_config { using base_type = output_config; @@ -101,13 +108,20 @@ struct config : output_config bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); bool pc_sampling_host_trap = false; + bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false); size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1); + bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false); rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE; rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE; std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec"); int mpi_size = get_mpi_size(); int mpi_rank = get_mpi_rank(); + uint64_t att_param_shader_engine_mask = + get_env("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1); + uint64_t att_param_buffer_size = get_env("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000); + uint64_t att_param_simd_select = get_env("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF); + uint64_t att_param_target_cu = get_env("ROCPROF_ATT_PARAM_TARGET_CU", 1); std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*"); std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", ""); @@ -115,8 +129,10 @@ struct config : output_config std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none"); std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", ""); - std::unordered_set kernel_filter_range = {}; - std::set counters = {}; + std::unordered_set kernel_filter_range = {}; + std::set counters = {}; + std::string att_capability = get_env("ROCPROF_ATT_CAPABILITY", ""); + std::vector att_param_perfcounters = {}; std::queue collection_periods = {}; diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index 9c0ca3ae..815268d1 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -45,8 +45,11 @@ #include "lib/output/statistics.hpp" #include "lib/output/tmp_file.hpp" #include "lib/output/tmp_file_buffer.hpp" +#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp" #include +#include +#include #include #include #include @@ -193,9 +196,10 @@ using kernel_iteration_t = std::unordered_map; using kernel_rename_stack_t = std::stack; -auto* tool_metadata = as_pointer(tool::metadata::inprocess{}); -auto target_kernels = common::Synchronized{}; -auto kernel_iteration = common::Synchronized{}; +auto* tool_metadata = as_pointer(tool::metadata::inprocess{}); +auto target_kernels = common::Synchronized{}; +auto kernel_iteration = common::Synchronized{}; +std::mutex att_shader_data; thread_local auto thread_dispatch_rename = as_pointer(); thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() { @@ -238,7 +242,14 @@ is_targeted_kernel(uint64_t _kern_id) // If the iteration range is not given then all iterations of the kernel is profiled if(_range.empty()) - return true; + { + if(!tool::get_config().advanced_thread_trace) + return true; + else + { + if(itr == 1) return true; + } + } else if(_range.find(itr) != _range.end()) return true; return false; @@ -607,6 +618,75 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, { CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data); } + + if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY && + tool::get_config().advanced_thread_trace) + { + const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name; + auto filename = fmt::format("{}_code_object_id_{}", + std::string(gpu_name), + std::to_string(obj_data->code_object_id)); + auto output_stream = get_output_stream(tool::get_config(), filename, ".out"); + std::string output_filename = + get_output_filename(tool::get_config(), filename, ".out"); + + // NOLINTNEXTLINE(performance-no-int-to-ptr) + output_stream.stream->write(reinterpret_cast(obj_data->memory_base), + obj_data->memory_size); + tool_metadata->code_object_load.wlock( + [](auto& data_vec, + std::string file_name, + tool::rocprofiler_code_object_info_t* obj_data_v) { + data_vec.push_back({file_name, + obj_data_v->code_object_id, + obj_data_v->load_base, + obj_data_v->load_size}); + }, + output_filename, + obj_data); + } + else if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE && + tool::get_config().advanced_thread_trace) + { + const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name; + auto filename = fmt::format("{}_code_object_id_{}", + std::string(gpu_name), + std::to_string(obj_data->code_object_id)); + auto output_stream = get_output_stream(tool::get_config(), filename, ".out"); + std::string output_filename = + get_output_filename(tool::get_config(), filename, ".out"); + + uint8_t* binary = nullptr; + size_t buffer_size = 0; + std::ifstream code_object_file(obj_data->uri, std::ios::binary | std::ios::ate); + if(code_object_file.good()) + { + buffer_size = code_object_file.tellg(); + code_object_file.seekg(0, std::ios::beg); + binary = new(std::nothrow) uint8_t[buffer_size]; + if(binary && + !code_object_file.read(reinterpret_cast(binary), buffer_size)) + { + delete[] binary; + binary = nullptr; + } + } + // NOLINTBEGIN(performance-no-int-to-ptr) + output_stream.stream->write(reinterpret_cast(obj_data->memory_base), + obj_data->memory_size); + // NOLINTEND(performance-no-int-to-ptr) + tool_metadata->code_object_load.wlock( + [](auto& data_vec, + std::string file_name, + tool::rocprofiler_code_object_info_t* obj_data_v) { + data_vec.push_back({file_name, + obj_data_v->code_object_id, + obj_data_v->load_base, + obj_data_v->load_size}); + }, + output_filename, + obj_data); + } } else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { @@ -879,6 +959,38 @@ get_instruction_index(rocprofiler_pc_t pc) } // namespace +std::vector +get_att_perfcounter_params(std::vector& att_perf_counters) +{ + std::vector _data; + if(att_perf_counters.empty()) return _data; + + static const auto gpu_agents = get_gpu_agents(); + static const auto gpu_agents_counter_info = get_agent_counter_info(); + + for(const auto& [agent_, tool_counter_info_] : gpu_agents_counter_info) + { + for(const auto& counter_info_ : tool_counter_info_) + { + if(std::string_view(counter_info_.block) != "SQ") continue; + + for(const auto& att_perf_counter : att_perf_counters) + { + if(std::string_view(counter_info_.name) == att_perf_counter.counter_name) + { + auto param = rocprofiler_att_parameter_t{}; + param.type = ROCPROFILER_ATT_PARAMETER_PERFCOUNTER, + param.counter_id = counter_info_.id, + param.simd_mask = att_perf_counter.simd_mask; + _data.emplace_back(param); + } + } + } + } + + return _data; +} + void rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, rocprofiler_buffer_id_t /* buffer_id*/, @@ -920,6 +1032,55 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, } } +void +att_shader_data_callback(rocprofiler_agent_id_t agent, + int64_t se_id, + void* se_data, + size_t data_size, + rocprofiler_user_data_t userdata) +{ + std::lock_guard lock(att_shader_data); + std::stringstream filename; + filename << fmt::format("{}_shader_engine_{}_{}", agent.handle, se_id, userdata.value); + + auto dispatch_id = static_cast(userdata.value); + auto output_stream = get_output_stream(tool::get_config(), filename.str(), ".att"); + std::string output_filename = get_output_filename(tool::get_config(), filename.str(), ".att"); + + output_stream.stream->write(reinterpret_cast(se_data), data_size); + tool_metadata->att_filenames[dispatch_id].first = agent; + tool_metadata->att_filenames[dispatch_id].second.emplace_back(output_filename); +} + +rocprofiler_att_control_flags_t +att_dispatch_callback(rocprofiler_agent_id_t /* agent_id */, + rocprofiler_queue_id_t /* queue_id */, + rocprofiler_correlation_id_t /* correlation_id */, + rocprofiler_kernel_id_t kernel_id, + rocprofiler_dispatch_id_t dispatch_id, + void* /*userdata_config*/, + rocprofiler_user_data_t* userdata_shader) +{ + userdata_shader->value = dispatch_id; + kernel_iteration.wlock( + [](auto& _kernel_iter, rocprofiler_kernel_id_t _kernel_id) { + auto itr = _kernel_iter.find(_kernel_id); + if(itr == _kernel_iter.end()) + _kernel_iter.emplace(_kernel_id, 1); + else + { + itr->second++; + } + }, + kernel_id); + + if(is_targeted_kernel(kernel_id)) + { + return ROCPROFILER_ATT_CONTROL_START_AND_STOP; + } + return ROCPROFILER_ATT_CONTROL_NONE; +} + void dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_profile_config_id_t* config, @@ -1214,6 +1375,37 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } } + if(tool::get_config().advanced_thread_trace) + { + auto parameters = std::vector{}; + uint64_t target_cu = tool::get_config().att_param_target_cu; + uint64_t simd_select = tool::get_config().att_param_simd_select; + uint64_t buffer_sz = tool::get_config().att_param_buffer_size; + uint64_t shader_mask = tool::get_config().att_param_shader_engine_mask; + auto& att_perf = tool::get_config().att_param_perfcounters; + bool att_serialize_all = tool::get_config().att_serialize_all; + auto att_perf_params = get_att_perfcounter_params(att_perf); + parameters.insert(parameters.end(), att_perf_params.begin(), att_perf_params.end()); + + // TODO: att params could be different for different devices. How to support? + // Input file schema might also need to change to support multiple ATT params + + parameters.push_back({ROCPROFILER_ATT_PARAMETER_TARGET_CU, {target_cu}}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, {simd_select}}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, {buffer_sz}}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, {shader_mask}}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_SERIALIZE_ALL, {att_serialize_all}}); + + ROCPROFILER_CALL( + rocprofiler_configure_dispatch_thread_trace_service(get_client_ctx(), + parameters.data(), + parameters.size(), + att_dispatch_callback, + att_shader_data_callback, + tool_data), + "thread trace service configure"); + } + if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace) { ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), @@ -1504,6 +1696,40 @@ tool_fini(void* /*tool_data*/) tool::generate_csv(tool::get_config(), *tool_metadata, contributions); } + if(tool::get_config().advanced_thread_trace) + { + std::unordered_map + tool_att_capability_map = { + {"testing", rocprofiler::att_wrapper::ATT_CAPABILITIES_TESTING}, + {"summary", rocprofiler::att_wrapper::ATT_CAPABILITIES_SUMMARY}, + {"trace", rocprofiler::att_wrapper::ATT_CAPABILITIES_TRACE}, + {"debug", rocprofiler::att_wrapper::ATT_CAPABILITIES_DEBUG}}; + + ROCP_FATAL_IF(tool::get_config().att_capability.empty()) + << "Provide the decoder parser method as input"; + + auto att_capability_value = tool_att_capability_map.at(tool::get_config().att_capability); + auto decoder = rocprofiler::att_wrapper::ATTDecoder(att_capability_value); + ROCP_FATAL_IF(!decoder.valid()) << "Decoder library not found at ROCPORF_ATT_LIBRARY_PATH"; + auto codeobj = tool_metadata->get_code_object_load_info(); + auto output_path = tool::format_path(tool::get_config().output_path); + for(auto& [dispatch_id, att_filename_data] : tool_metadata->att_filenames) + { + std::string formats = "json,csv"; + // if(tool::get_config().json_output) formats += "json,"; + // if(tool::get_config().csv_output) formats += "csv,"; + + std::stringstream ui_name; + ui_name << fmt::format("ui_output_agent_{}_dispatch_{}", + std::to_string(att_filename_data.first.handle), + dispatch_id); + auto out_path = fmt::format("{}/{}", output_path, ui_name.str()); + std::string in_path = "."; + + decoder.parse(in_path, out_path, att_filename_data.second, codeobj, formats); + } + } + if(tool::get_config().json_output) { auto json_ar = tool::open_json(tool::get_config()); diff --git a/tests/bin/CMakeLists.txt b/tests/bin/CMakeLists.txt index c065303c..5d89715b 100644 --- a/tests/bin/CMakeLists.txt +++ b/tests/bin/CMakeLists.txt @@ -32,3 +32,4 @@ add_subdirectory(pc-sampling) if(ROCPROFILER_BUILD_ROCDECODE_TESTS) add_subdirectory(rocdecode) endif() +add_subdirectory(hsa-code-object) diff --git a/tests/bin/hsa-code-object/CMakeLists.txt b/tests/bin/hsa-code-object/CMakeLists.txt new file mode 100644 index 00000000..a19e77c9 --- /dev/null +++ b/tests/bin/hsa-code-object/CMakeLists.txt @@ -0,0 +1,90 @@ +# +# +# HSA multi-queue dependency test + +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-tests-bin-hsa-code-object LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_program( + amdclangpp_EXECUTABLE REQUIRED + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + +function(generate_hsaco TARGET_ID INPUT_FILE OUTPUT_FILE) + separate_arguments( + CLANG_ARG_LIST + UNIX_COMMAND + "-O2 -x cl -Xclang -finclude-default-header -cl-denorms-are-zero -cl-std=CL2.0 -Wl,--build-id=sha1 + -target amdgcn-amd-amdhsa -mcpu=${TARGET_ID} -o ${OUTPUT_FILE} ${INPUT_FILE}") + add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/${OUTPUT_FILE} + COMMAND ${amdclangpp_EXECUTABLE} ${CLANG_ARG_LIST} + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/${OUTPUT_FILE} + ${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE} + OUTPUT ${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE} + COMMAND + ${CMAKE_COMMAND} -E copy + ${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE} + ${CMAKE_BINARY_DIR}/rocprofv3/advanced-thread-trace/${OUTPUT_FILE} + COMMENT "Building ${OUTPUT_FILE}...") + set(HSACO_TARGET_LIST + ${HSACO_TARGET_LIST} ${PROJECT_BINARY_DIR}/${OUTPUT_FILE} + PARENT_SCOPE) +endfunction(generate_hsaco) + +foreach(target_id ${GPU_TARGETS}) + # generate kernel bitcodes + generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy.cl + ${target_id}_copy.hsaco) + generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy_memory.cl + ${target_id}_copy_memory.hsaco) +endforeach() + +add_custom_target(generate_hsaco_targets_code_object DEPENDS ${HSACO_TARGET_LIST}) + +add_executable(hsa_code_object_testapp) +target_sources(hsa_code_object_testapp PRIVATE hsa_code_object_app.cpp) +target_compile_options(hsa_code_object_testapp PRIVATE -W -Wall -Wextra -Wshadow -Werror) + +find_package(Threads REQUIRED) +target_link_libraries(hsa_code_object_testapp PRIVATE stdc++fs Threads::Threads) + +find_package( + amd_comgr + REQUIRED + CONFIG + HINTS + ${CMAKE_INSTALL_PREFIX} + PATHS + ${ROCM_PATH} + PATH_SUFFIXES + lib/cmake/amd_comgr) + +target_link_libraries(hsa_code_object_testapp PRIVATE amd_comgr) + +find_package(rocprofiler-sdk REQUIRED) +target_link_libraries( + hsa_code_object_testapp PRIVATE rocprofiler-sdk::rocprofiler-sdk + rocprofiler-sdk::tests-common-library) + +find_package( + hsa-runtime64 + REQUIRED + CONFIG + HINTS + ${rocm_version_DIR} + ${ROCM_PATH} + PATHS + ${rocm_version_DIR} + ${ROCM_PATH}) + +target_link_libraries(hsa_code_object_testapp PRIVATE hsa-runtime64::hsa-runtime64) + +add_dependencies(hsa_code_object_testapp generate_hsaco_targets_code_object) diff --git a/tests/bin/hsa-code-object/copy.cl b/tests/bin/hsa-code-object/copy.cl new file mode 100644 index 00000000..eadc65f1 --- /dev/null +++ b/tests/bin/hsa-code-object/copy.cl @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +__kernel void copyA(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} +__kernel void copyB(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} +__kernel void copyC(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} diff --git a/tests/bin/hsa-code-object/copy_memory.cl b/tests/bin/hsa-code-object/copy_memory.cl new file mode 100644 index 00000000..09cabd0a --- /dev/null +++ b/tests/bin/hsa-code-object/copy_memory.cl @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +__kernel void copyD(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} +__kernel void copyE(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} +__kernel void copyF(__global unsigned int* a, __global unsigned int* b) { + uint tid = get_global_id(0); + a[tid] = b[tid]; +} diff --git a/tests/bin/hsa-code-object/hsa_code_object_app.cpp b/tests/bin/hsa-code-object/hsa_code_object_app.cpp new file mode 100644 index 00000000..19ce108a --- /dev/null +++ b/tests/bin/hsa-code-object/hsa_code_object_app.cpp @@ -0,0 +1,492 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/** ROC Profiler Multi Queue Dependency Test + * + * The goal of this test is to ensure ROC profiler does not go to deadlock + * when multiple queue are created and they are dependent on each other + * + */ + +#include "hsa_code_object_app.h" + +enum class storage_type +{ + CODE_OBJECT_STORAGE_FILE, + CODE_OBJECT_STORAGE_MEMORY +}; + +void +code_object_load(MQDependencyTest& obj, + storage_type type, + MQDependencyTest::CodeObject& code_object) +{ + hsa_status_t status; + obj.device_discovery(); + char agent_name[64]; + status = hsa_agent_get_info(obj.gpu[0].agent, HSA_AGENT_INFO_NAME, agent_name); + RET_IF_HSA_ERR(status) + + if(type == storage_type::CODE_OBJECT_STORAGE_FILE) + { + std::string hasco_file_path = std::string(agent_name) + std::string("_copy.hsaco"); + obj.search_hasco(fs::current_path(), hasco_file_path); + if(!obj.load_code_object(hasco_file_path, obj.gpu[0].agent, code_object)) + { + printf("Kernel file not found or not usable with given agent.\n"); + abort(); + } + } + else + { + std::string hasco_file_path = std::string(agent_name) + std::string("_copy_memory.hsaco"); + obj.search_hasco(fs::current_path(), hasco_file_path); + if(!obj.load_code_object_memory(hasco_file_path, obj.gpu[0].agent, code_object)) + { + abort(); + } + } +} + +MQDependencyTest::Kernel +get_kernel(MQDependencyTest::CodeObject& code_object, + std::string kernel_name, + MQDependencyTest& obj) +{ + MQDependencyTest::Kernel copy; + if(!obj.get_kernel(code_object, kernel_name, obj.gpu[0].agent, copy)) + { + printf("Test %s not found.\n", kernel_name.c_str()); + abort(); + } + return copy; +} + +int +main() +{ + hsa_status_t status; + MQDependencyTest obj; + MQDependencyTest obj_memory = {}; + MQDependencyTest::CodeObject code_object = {}, code_object_memory = {}; + + code_object_load(obj, storage_type::CODE_OBJECT_STORAGE_FILE, code_object); + code_object_load(obj_memory, storage_type::CODE_OBJECT_STORAGE_MEMORY, code_object_memory); + + MQDependencyTest::Kernel copyA = get_kernel(code_object, "copyA", obj); + MQDependencyTest::Kernel copyB = get_kernel(code_object, "copyB", obj); + MQDependencyTest::Kernel copyC = get_kernel(code_object, "copyC", obj); + + MQDependencyTest::Kernel copyD = get_kernel(code_object_memory, "copyD", obj_memory); + MQDependencyTest::Kernel copyE = get_kernel(code_object_memory, "copyE", obj_memory); + MQDependencyTest::Kernel copyF = get_kernel(code_object_memory, "copyF", obj_memory); + + struct args_t + { + uint32_t* a = nullptr; + uint32_t* b = nullptr; + MQDependencyTest::OCLHiddenArgs hidden = {}; + }; + + args_t* args = static_cast(obj.hsa_malloc(sizeof(args_t), obj.kernarg)); + *args = {}; + + uint32_t* a = static_cast(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg)); + uint32_t* b = static_cast(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg)); + + memset(a, 0, 64 * sizeof(uint32_t)); + memset(b, 1, 64 * sizeof(uint32_t)); + + args_t* args_memory = + static_cast(obj_memory.hsa_malloc(sizeof(args_t), obj_memory.kernarg)); + *args_memory = {}; + + uint32_t* c = + static_cast(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg)); + uint32_t* d = + static_cast(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg)); + + memset(c, 0, 64 * sizeof(uint32_t)); + memset(d, 1, 64 * sizeof(uint32_t)); + + // Create queue in gpu agent and prepare a kernel dispatch packet + hsa_queue_t* queue1 = nullptr; + status = hsa_queue_create(obj.gpu[0].agent, + 1024, + HSA_QUEUE_TYPE_SINGLE, + nullptr, + nullptr, + UINT32_MAX, + UINT32_MAX, + &queue1); + RET_IF_HSA_ERR(status) + + // Create a signal with a value of 1 and attach it to the first kernel + // dispatch packet + hsa_signal_t completion_signal_1 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_1); + RET_IF_HSA_ERR(status) + + // First dispath packet on queue 1, Kernel A + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyA.group; + packet.dispatch.private_segment_size = copyA.scratch; + packet.dispatch.kernel_object = copyA.handle; + + packet.dispatch.kernarg_address = args; + packet.dispatch.completion_signal = completion_signal_1; + + args->a = a; + args->b = b; + // Tell packet processor of A to launch the first kernel dispatch packet + obj.submit_packet(queue1, packet); + } + + // Create a signal with a value of 1 and attach it to the second kernel + // dispatch packet + hsa_signal_t completion_signal_2 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_2); + RET_IF_HSA_ERR(status) + + hsa_signal_t completion_signal_3 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_3); + RET_IF_HSA_ERR(status) + + // Create barrier-AND packet that is enqueued in queue 1 + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.barrier_and.dep_signal[0] = completion_signal_2; + obj.submit_packet(queue1, packet); + } + + // Second dispath packet on queue 1, Kernel C + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyC.group; + packet.dispatch.private_segment_size = copyC.scratch; + packet.dispatch.kernel_object = copyC.handle; + packet.dispatch.completion_signal = completion_signal_3; + packet.dispatch.kernarg_address = args; + + args->a = a; + args->b = b; + // Tell packet processor to launch the second kernel dispatch packet + obj.submit_packet(queue1, packet); + } + + // Create queue 2 + hsa_queue_t* queue2 = nullptr; + status = hsa_queue_create(obj.gpu[0].agent, + 1024, + HSA_QUEUE_TYPE_SINGLE, + nullptr, + nullptr, + UINT32_MAX, + UINT32_MAX, + &queue2); + RET_IF_HSA_ERR(status) + + // Create barrier-AND packet that is enqueued in queue 2 + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.barrier_and.dep_signal[0] = completion_signal_1; + obj.submit_packet(queue2, packet); + } + + // Third dispath packet on queue 2, Kernel B + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyB.group; + packet.dispatch.private_segment_size = copyB.scratch; + packet.dispatch.kernel_object = copyB.handle; + + packet.dispatch.kernarg_address = args; + packet.dispatch.completion_signal = completion_signal_2; + + args->a = a; + args->b = b; + // Tell packet processor to launch the third kernel dispatch packet + obj.submit_packet(queue2, packet); + } + // Create a signal with a value of 1 and attach it to the first kernel + // dispatch packet + hsa_signal_t completion_signal_4 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_4); + RET_IF_HSA_ERR(status) + // First dispath packet on queue 1, Kernel D + { + [[maybe_unused]] MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyD.group; + packet.dispatch.private_segment_size = copyD.scratch; + packet.dispatch.kernel_object = copyD.handle; + + packet.dispatch.kernarg_address = args_memory; + packet.dispatch.completion_signal = completion_signal_4; + + args_memory->a = c; + args_memory->b = d; + // Tell packet processor of A to launch the first kernel dispatch packet + obj_memory.submit_packet(queue1, packet); + } + + // Create a signal with a value of 1 and attach it to the second kernel + // dispatch packet + hsa_signal_t completion_signal_5 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_5); + RET_IF_HSA_ERR(status) + hsa_signal_t completion_signal_6 = {}; + status = hsa_signal_create(1, 0, nullptr, &completion_signal_6); + RET_IF_HSA_ERR(status) + + // Create barrier-AND packet that is enqueued in queue 1 + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.barrier_and.dep_signal[0] = completion_signal_5; + obj_memory.submit_packet(queue1, packet); + } + // Second dispath packet on queue 1, Kernel F + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyF.group; + packet.dispatch.private_segment_size = copyF.scratch; + packet.dispatch.kernel_object = copyF.handle; + packet.dispatch.completion_signal = completion_signal_6; + packet.dispatch.kernarg_address = args_memory; + + args_memory->a = c; + args_memory->b = d; + // Tell packet processor to launch the second kernel dispatch packet + obj_memory.submit_packet(queue1, packet); + } + // Create barrier-AND packet that is enqueued in queue 2 + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.barrier_and.dep_signal[0] = completion_signal_4; + obj_memory.submit_packet(queue2, packet); + } + // Third dispath packet on queue 2, Kernel + { + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = copyE.group; + packet.dispatch.private_segment_size = copyE.scratch; + packet.dispatch.kernel_object = copyE.handle; + + packet.dispatch.kernarg_address = args_memory; + packet.dispatch.completion_signal = completion_signal_5; + + args_memory->a = c; + args_memory->b = d; + // Tell packet processor to launch the third kernel dispatch packet + obj_memory.submit_packet(queue2, packet); + } + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_1, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_2, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_3, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_4, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_5, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Wait on the completion signal + hsa_signal_wait_relaxed( + completion_signal_6, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + for(int i = 0; i < 64; i++) + { + if(a[i] != b[i]) + { + printf("error at %d: expected %d, got %d\n", i, b[i], a[i]); + abort(); + } + } + + // Clearing data structures and memory + status = hsa_signal_destroy(completion_signal_1); + RET_IF_HSA_ERR(status) + + status = hsa_signal_destroy(completion_signal_2); + RET_IF_HSA_ERR(status) + + status = hsa_signal_destroy(completion_signal_3); + RET_IF_HSA_ERR(status) + + // Clearing data structures and memory + status = hsa_signal_destroy(completion_signal_4); + RET_IF_HSA_ERR(status) + + status = hsa_signal_destroy(completion_signal_5); + RET_IF_HSA_ERR(status) + + status = hsa_signal_destroy(completion_signal_6); + RET_IF_HSA_ERR(status) + + if(queue1 != nullptr) + { + status = hsa_queue_destroy(queue1); + RET_IF_HSA_ERR(status) + } + + if(queue2 != nullptr) + { + status = hsa_queue_destroy(queue2); + RET_IF_HSA_ERR(status) + } + + status = hsa_memory_free(a); + RET_IF_HSA_ERR(status) + + status = hsa_memory_free(b); + RET_IF_HSA_ERR(status) + + status = hsa_memory_free(c); + RET_IF_HSA_ERR(status) + + status = hsa_memory_free(d); + RET_IF_HSA_ERR(status) + + status = hsa_executable_destroy(code_object.executable); + RET_IF_HSA_ERR(status) + + status = hsa_code_object_reader_destroy(code_object.code_obj_rdr); + RET_IF_HSA_ERR(status) + + status = hsa_executable_destroy(code_object_memory.executable); + RET_IF_HSA_ERR(status) + + status = hsa_code_object_reader_destroy(code_object_memory.code_obj_rdr); + RET_IF_HSA_ERR(status) + + close(code_object.file); + + close(code_object_memory.file); +} diff --git a/tests/bin/hsa-code-object/hsa_code_object_app.h b/tests/bin/hsa-code-object/hsa_code_object_app.h new file mode 100644 index 00000000..2ba79f6c --- /dev/null +++ b/tests/bin/hsa-code-object/hsa_code_object_app.h @@ -0,0 +1,415 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "common/filesystem.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace fs = common::fs; + +#define RET_IF_HSA_ERR(err) \ + { \ + if((err) != HSA_STATUS_SUCCESS) \ + { \ + char err_val[12]; \ + char* err_str = nullptr; \ + if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \ + { \ + sprintf(&(err_val[0]), "%#x", (uint32_t) err); \ + err_str = &(err_val[0]); \ + } \ + printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \ + printf("Call returned %s\n", err_str); \ + abort(); \ + } \ + } + +struct Device +{ + struct Memory + { + hsa_amd_memory_pool_t pool; + bool fine; + bool kernarg; + size_t size; + size_t granule; + }; + + hsa_agent_t agent; + char name[64]; + std::vector pools; + uint32_t fine; + uint32_t coarse; + static std::vector all_devices; +}; + +class MQDependencyTest +{ +public: + MQDependencyTest() { hsa_init(); } + ~MQDependencyTest() { hsa_shut_down(); } + + std::vector cpu; + std::vector gpu; + Device::Memory kernarg; + std::vector all_devices = {}; + + struct CodeObject + { + hsa_file_t file = 0; + hsa_code_object_reader_t code_obj_rdr = {}; + hsa_executable_t executable = {}; + }; + + struct Kernel + { + uint64_t handle = 0; + uint32_t scratch = 0; + uint32_t group = 0; + uint32_t kernarg_size = 0; + uint32_t kernarg_align = 0; + }; + + union AqlHeader + { + struct + { + uint16_t type : 8; + uint16_t barrier : 1; + uint16_t acquire : 2; + uint16_t release : 2; + uint16_t reserved : 3; + }; + uint16_t raw = 0; + }; + + struct BarrierValue + { + AqlHeader header = {}; + uint8_t AmdFormat = 0; + uint8_t reserved = 0; + uint32_t reserved1 = 0; + hsa_signal_t signal = {}; + hsa_signal_value_t value = 0; + hsa_signal_value_t mask = 0; + uint32_t cond = 0; + uint32_t reserved2 = 0; + uint64_t reserved3 = 0; + uint64_t reserved4 = 0; + hsa_signal_t completion_signal = {}; + }; + + union Aql + { + AqlHeader header; + hsa_kernel_dispatch_packet_t dispatch; + hsa_barrier_and_packet_t barrier_and; + hsa_barrier_or_packet_t barrier_or; + BarrierValue barrier_value = {}; + }; + + struct OCLHiddenArgs + { + uint64_t offset_x = 0; + uint64_t offset_y = 0; + uint64_t offset_z = 0; + void* printf_buffer = nullptr; + void* enqueue = nullptr; + void* enqueue2 = nullptr; + void* multi_grid = nullptr; + }; + + bool load_code_object(const std::string& filename, hsa_agent_t agent, CodeObject& code_object) + { + hsa_status_t err; + code_object.file = open(filename.c_str(), O_RDONLY); + if(code_object.file == -1) + { + fprintf(stderr, "%s:%s\n", "Could not load code object", filename.c_str()); + abort(); + return false; + } + + err = hsa_code_object_reader_create_from_file(code_object.file, &code_object.code_obj_rdr); + RET_IF_HSA_ERR(err); + + err = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &code_object.executable); + RET_IF_HSA_ERR(err); + + err = hsa_executable_load_agent_code_object( + code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr); + if(err != HSA_STATUS_SUCCESS) return false; + + err = hsa_executable_freeze(code_object.executable, nullptr); + RET_IF_HSA_ERR(err); + + return true; + } + + bool load_code_object_memory(const std::string& filename, + hsa_agent_t agent, + CodeObject& code_object) + { + hsa_status_t err; + size_t buffer_size = 0; + std::ifstream code_object_file(filename.c_str(), std::ios::binary | std::ios::ate); + if(!code_object_file.good()) return false; + buffer_size = code_object_file.tellg(); + code_object_file.seekg(0, std::ios::beg); + uint8_t* binary = new(std::nothrow) uint8_t[buffer_size]; + if(binary && !code_object_file.read(reinterpret_cast(binary), buffer_size)) + { + delete[] binary; + binary = nullptr; + return false; + } + + err = hsa_code_object_reader_create_from_memory( + binary, buffer_size, &code_object.code_obj_rdr); + RET_IF_HSA_ERR(err); + + err = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, + &code_object.executable); + RET_IF_HSA_ERR(err); + + err = hsa_executable_load_agent_code_object( + code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr); + if(err != HSA_STATUS_SUCCESS) return false; + + err = hsa_executable_freeze(code_object.executable, nullptr); + RET_IF_HSA_ERR(err); + delete[] binary; + binary = nullptr; + return true; + } + + bool get_kernel(const CodeObject& code_object, + const std::string& kernel, + hsa_agent_t agent, + Kernel& kern) + { + hsa_executable_symbol_t symbol; + hsa_status_t err = hsa_executable_get_symbol_by_name( + code_object.executable, kernel.c_str(), &agent, &symbol); + if(err != HSA_STATUS_SUCCESS) + { + err = hsa_executable_get_symbol_by_name( + code_object.executable, (kernel + ".kd").c_str(), &agent, &symbol); + if(err != HSA_STATUS_SUCCESS) + { + return false; + } + } + printf("\nkernel-name: %s\n", kernel.c_str()); + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kern.handle); + RET_IF_HSA_ERR(err); + + return true; + } + + // Not for parallel insertion. + bool submit_packet(hsa_queue_t* queue, Aql& pkt) + { + size_t mask = queue->size - 1; + Aql* ring = static_cast(queue->base_address); + + uint64_t write = hsa_queue_load_write_index_relaxed(queue); + uint64_t read = hsa_queue_load_read_index_relaxed(queue); + if(write - read + 1 > queue->size) return false; + + Aql& dst = ring[write & mask]; + + uint16_t header = pkt.header.raw; + pkt.header.raw = dst.header.raw; + dst = pkt; + __atomic_store_n(&dst.header.raw, header, __ATOMIC_RELEASE); + pkt.header.raw = header; + + hsa_queue_store_write_index_release(queue, write + 1); + hsa_signal_store_screlease(queue->doorbell_signal, write); + + return true; + } + + void* hsa_malloc(size_t size, const Device::Memory& mem) + { + void* ret; + hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret); + RET_IF_HSA_ERR(err); + + err = hsa_amd_agents_allow_access(all_devices.size(), all_devices.data(), nullptr, ret); + RET_IF_HSA_ERR(err); + return ret; + } + + void* hsa_malloc(size_t size, const Device& dev, bool fine) + { + uint32_t index = fine ? dev.fine : dev.coarse; + assert(index != -1u && "Memory type unavailable."); + return hsa_malloc(size, dev.pools[index]); + } + + bool device_discovery() + { + hsa_status_t err; + + err = hsa_iterate_agents( + [](hsa_agent_t agent, void* obj) { + hsa_status_t error; + + Device dev; + dev.agent = agent; + + dev.fine = -1u; + dev.coarse = -1u; + MQDependencyTest* _obj = reinterpret_cast(obj); + error = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, dev.name); + RET_IF_HSA_ERR(error) + + hsa_device_type_t type; + error = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + RET_IF_HSA_ERR(error) + + error = hsa_amd_agent_iterate_memory_pools( + agent, + [](hsa_amd_memory_pool_t pool, void* data) { + auto& pools = *reinterpret_cast*>(data); + hsa_status_t status; + + bool allowed = false; + status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed); + + if(!allowed) return HSA_STATUS_SUCCESS; + + hsa_amd_segment_t segment; + status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + RET_IF_HSA_ERR(status) + + if(segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS; + + uint32_t flags; + status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); + RET_IF_HSA_ERR(status) + + Device::Memory mem; + mem.pool = pool; + mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u); + mem.kernarg = + ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u); + + status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size); + RET_IF_HSA_ERR(status) + + status = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &mem.granule); + RET_IF_HSA_ERR(status) + + pools.push_back(mem); + return HSA_STATUS_SUCCESS; + }, + static_cast(&dev.pools)); + + if(!dev.pools.empty()) + { + for(size_t i = 0; i < dev.pools.size(); i++) + { + if(dev.pools[i].fine && dev.pools[i].kernarg && dev.fine == -1u) + dev.fine = i; + if(dev.pools[i].fine && !dev.pools[i].kernarg) dev.fine = i; + if(!dev.pools[i].fine) dev.coarse = i; + } + + if(type == HSA_DEVICE_TYPE_CPU) + _obj->cpu.push_back(dev); + else + _obj->gpu.push_back(dev); + + _obj->all_devices.push_back(dev.agent); + } + + return HSA_STATUS_SUCCESS; + }, + this); + + bool is_break = false; + for(auto& dev : cpu) + { + for(auto& mem : dev.pools) + { + if(mem.fine && mem.kernarg) + { + kernarg = mem; + is_break = true; + break; + } + } + if(is_break) break; + } + + RET_IF_HSA_ERR(err); + + if(cpu.empty() || gpu.empty() || kernarg.pool.handle == 0) return false; + return true; + } + + void search_hasco(const fs::path& directory, std::string& filename) + { + for(const auto& entry : fs::directory_iterator(directory)) + { + if(fs::is_regular_file(entry)) + { + if(entry.path().filename() == filename) + { + filename = entry.path(); + } + } + else if(fs::is_directory(entry)) + { + search_hasco(entry, filename); // Recursive call for subdirectories + } + } + } +}; diff --git a/tests/rocprofv3/CMakeLists.txt b/tests/rocprofv3/CMakeLists.txt index 70b8fe84..1404bc5e 100644 --- a/tests/rocprofv3/CMakeLists.txt +++ b/tests/rocprofv3/CMakeLists.txt @@ -39,3 +39,6 @@ add_subdirectory(collection-period) if(ROCPROFILER_BUILD_ROCDECODE_TESTS) add_subdirectory(rocdecode-trace) endif() +if(TARGET att_decoder_testing) + add_subdirectory(advanced-thread-trace) +endif() diff --git a/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt new file mode 100644 index 00000000..cfe9f89c --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt @@ -0,0 +1,100 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-rocprofv3-att + LANGUAGES CXX + VERSION 0.0.0) + +string(REPLACE "LD_PRELOAD=" "--preload;" PRELOAD_ARGS + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + att_input.json) + +find_package(rocprofiler-sdk REQUIRED) + +# hsa multiqueue dependency test + +add_test( + NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute + COMMAND + $ --advanced-thread-trace 1 + --att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD + --att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing + --att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o + out --output-format json ${PRELOAD_ARGS} -- + $) + +set_tests_properties( + rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH} + FAIL_REGULAR_EXPRESSION "HSA_API|HIP_API") +# hsa multiqueue dependency test +add_test( + NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute + COMMAND + $ --advanced-thread-trace 1 + --att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD + --att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing + --att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o + out --output-format json ${PRELOAD_ARGS} -- + $) + +set_tests_properties( + rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + ATT_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib FAIL_REGULAR_EXPRESSION + "HSA_API|HIP_API") + +# hsa multiqueue dependency test +add_test( + NAME rocprofv3-test-hsa-multiqueue-att-json-execute + COMMAND + $ --att-library-path + ${CMAKE_BINARY_DIR}/lib -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/json_input -i + ${CMAKE_CURRENT_BINARY_DIR}/att_input.json ${PRELOAD_ARGS} -- + $) + +set_tests_properties( + rocprofv3-test-hsa-multiqueue-att-json-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" FAIL_REGULAR_EXPRESSION + "HSA_API|HIP_API") + +add_test( + NAME rocprofv3-test-hsa-multiqueue-att-cmd-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json + --code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input) + +add_test( + NAME rocprofv3-test-hsa-multiqueue-att-json-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/out_results.json + --code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input) + +set(MULTIQUEUE_CMD_VALIDATION_FILES + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json) + +set(MULTIQUEUE_JSON_VALIDATION_FILES + ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/pass_1/out_results.json + ) + +set_tests_properties( + rocprofv3-test-hsa-multiqueue-att-cmd-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + "rocprofv3-test-hsa-multiqueue-att-cmd-ld-lib-path-execute" + FAIL_REGULAR_EXPRESSION "AssertionError") + +set_tests_properties( + rocprofv3-test-hsa-multiqueue-att-json-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + "rocprofv3-test-hsa-multiqueue-att-json-execute" FAIL_REGULAR_EXPRESSION + "AssertionError") diff --git a/tests/rocprofv3/advanced-thread-trace/att_input.json b/tests/rocprofv3/advanced-thread-trace/att_input.json new file mode 100644 index 00000000..6a27d932 --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/att_input.json @@ -0,0 +1,20 @@ +{ + "jobs": [ + { + "kernel_include_regex": "copyD", + "kernel_exclude_regex": "", + "output_file": "out", + "output_format": [ + "json" + ], + "truncate_kernels": true, + "advanced_thread_trace": true, + "att_parse" : "testing", + "att_target_cu" : 1, + "att_shader_engine_mask" : "0x11", + "att_simd_select": "0x3", + "att_buffer_size": "0x6000000", + "att_perfcounters": "SQ_WAVES:0x1 SQ_INSTS_VALU:0x3 SQ_INSTS_SALU:0xF" + } + ] +} diff --git a/tests/rocprofv3/advanced-thread-trace/conftest.py b/tests/rocprofv3/advanced-thread-trace/conftest.py new file mode 100644 index 00000000..d694b6a6 --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/conftest.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import csv +import pytest +import json + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list +from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader + +import re +import os + + +def pytest_addoption(parser): + + parser.addoption( + "--input", + action="store", + help="Path to JSON file.", + ) + parser.addoption( + "--code-object-input", + action="store", + help="Path to code object file.", + ) + parser.addoption( + "--output-path", + action="store", + help="Output Path.", + ) + + +@pytest.fixture +def json_data(request): + filename = request.config.getoption("--input") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) + + +@pytest.fixture +def output_path(request): + return request.config.getoption("--output-path") + + +@pytest.fixture +def code_object_file_path(request): + file_path = request.config.getoption("--code-object-input") + # hsa_file_load = re.compile(".*copy.hsaco$") + code_object_files = {} + code_object_memory = [] + hsa_memory_load_pattern = "gfx[a-z0-9]+_copy_memory.hsaco" + for root, dirs, files in os.walk(file_path, topdown=True): + for file in files: + filename = os.path.join(root, file) + if re.search(hsa_memory_load_pattern, filename): + code_object_memory.append(filename) + code_object_files["hsa_memory_load"] = code_object_memory + return code_object_files diff --git a/tests/rocprofv3/advanced-thread-trace/pytest.ini b/tests/rocprofv3/advanced-thread-trace/pytest.ini new file mode 100644 index 00000000..5e1e1c14 --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/rocprofv3/advanced-thread-trace/validate.py b/tests/rocprofv3/advanced-thread-trace/validate.py new file mode 100644 index 00000000..873c1c39 --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/validate.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import re +import os + + +def test_json_data(json_data): + data = json_data["rocprofiler-sdk-tool"] + strings = data["strings"] + assert "att_filenames" in strings.keys() + att_files = data["strings"]["att_filenames"] + assert len(att_files) > 0 + + +def test_code_object_memory(code_object_file_path, json_data, output_path): + + data = json_data["rocprofiler-sdk-tool"] + tool_memory_load = data["strings"]["code_object_snapshot_filenames"] + gfx_pattern = "gfx[a-z0-9]+" + match = re.search(gfx_pattern, tool_memory_load[0]) + assert match != None + gpu_name = match.group(0) + tool_memory_load_1 = open(os.path.join(output_path, tool_memory_load[0]), "rb") + tool_memory_load_2 = open(os.path.join(output_path, tool_memory_load[1]), "rb") + found = False + for hsa_file in code_object_file_path["hsa_memory_load"]: + + m = re.search(gfx_pattern, hsa_file) + assert m != None + gpu = m.group(0) + + if gpu == gpu_name: + found = True + hsa_memory_load = open(hsa_file, "rb") + hsa_memory_fs = hsa_memory_load.read() + tool_memory_fs_1 = tool_memory_load_1.read() + tool_memory_fs_2 = tool_memory_load_2.read() + assert hsa_memory_fs == tool_memory_fs_2 or hsa_memory_fs == tool_memory_fs_1 + break + assert found == True + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)