From 73e8381cba8bacb842f36d02e69b23dbfa4d4f4e Mon Sep 17 00:00:00 2001
From: Izzet Yildirim
Date: Fri, 14 Feb 2025 22:14:04 -0600
Subject: [PATCH 1/4] Add script to compact trace files by process ID.

---
 script/dftracer_compact_by_pid | 185 +++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100755 script/dftracer_compact_by_pid

diff --git a/script/dftracer_compact_by_pid b/script/dftracer_compact_by_pid
new file mode 100755
index 0000000..be2ac8a
--- /dev/null
+++ b/script/dftracer_compact_by_pid
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+# Compacts trace files by process ID and splits each process's events into
+# files holding at most the requested number of lines.
+#
+# Input file names must carry the process ID as their third '-'-separated
+# field. Requires jq and a split that understands --numeric-suffixes and
+# --additional-suffix (GNU coreutils).
+
+compressed=0
+input_dir=$PWD
+num_lines=10000
+output_dir=$PWD/output
+output_prefix=app
+override=0
+working_dir=$PWD
+
+# Print the help text and exit with the given status (defaults to 1).
+function usage {
+    echo "usage: dftracer_compact_by_pid [-fcvh] [-d input_directory] [-o output_directory] [-l num_lines] [-p output_prefix]"
+    echo " -f override output directory"
+    echo " -c compress output file"
+    echo " -v enable verbose mode"
+    echo " -h display help"
+    echo " -d input_directory specify input directories (must contain .pfw or .pfw.gz files)"
+    echo " -o output_directory specify output directory"
+    echo " -l num_lines lines per output file"
+    echo " -p output_prefix prefix for output files"
+    exit "${1:-1}"
+}
+
+# Parse command-line arguments
+while getopts ':cvfd:o:l:p:h' opt; do
+    case "$opt" in
+    d)
+        input_dir="${OPTARG}"
+        ;;
+    o)
+        output_dir="${OPTARG}"
+        ;;
+    l)
+        num_lines="${OPTARG}"
+        ;;
+    p)
+        output_prefix="${OPTARG}"
+        ;;
+    f)
+        override=1
+        ;;
+    v)
+        set -x
+        ;;
+    c)
+        compressed=1
+        ;;
+    h)
+        # Asking for help is not an error.
+        usage 0
+        ;;
+    :)
+        echo "Option $OPTARG requires an argument." >&2
+        usage 1
+        ;;
+    \?)
+        echo "Invalid command option." >&2
+        usage 1
+        ;;
+    esac
+done
+shift "$((OPTIND - 1))"
+
+# Validate arguments and tools before anything on disk is modified
+if ! [[ $num_lines =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: num_lines must be a positive integer, got '$num_lines'" >&2
+    exit 1
+fi
+if ! command -v jq >/dev/null 2>&1; then
+    echo "Error: jq is required but was not found in PATH" >&2
+    exit 1
+fi
+if [ ! -d "$input_dir" ]; then
+    echo "Error: Input directory '$input_dir' does not exist" >&2
+    exit 1
+fi
+
+# Collect input traces NUL-delimited so paths with whitespace stay intact
+trace_files=()
+while IFS= read -r -d '' f; do
+    trace_files+=("$f")
+done < <(find "$input_dir" -maxdepth 1 -type f \( -name "*.pfw" -o -name "*.pfw.gz" \) -print0)
+if [ ${#trace_files[@]} -eq 0 ]; then
+    echo "Error: No input traces found in '$input_dir'" >&2
+    exit 1
+fi
+
+# Check and prepare the output directory
+if [ -d "${output_dir}" ]; then
+    # The output directory is removed below; refuse if that would also
+    # remove the input directory (same directory or a parent of it).
+    input_real=$(cd "$input_dir" && pwd -P)
+    output_real=$(cd "$output_dir" && pwd -P)
+    case "${input_real}/" in
+    "${output_real%/}/"*)
+        echo "Error: Output directory ${output_dir} contains the input directory and cannot be reset." >&2
+        exit 1
+        ;;
+    esac
+    if [ ${override} -eq 0 ] && [ -n "$(ls -A "${output_dir}")" ]; then
+        echo "Error: Output directory ${output_dir} is not empty. Use -f to override." >&2
+        exit 1
+    fi
+fi
+
+echo "Setting up output directory"
+rm -rf "${output_dir}"
+mkdir -p "${output_dir}" || exit 1
+
+num_files=${#trace_files[@]}
+echo "Found $num_files trace files. Now grouping them by process ID..."
+
+# Process .pfw and .pfw.gz files by extracting process ID and grouping data
+for f in "${trace_files[@]}"; do
+    file_name=$(basename "$f")
+    file_name="${file_name%.gz}"
+    file_name="${file_name%.pfw}"
+    IFS='-' read -r -a parts <<<"$file_name"
+    if [[ ${#parts[@]} -lt 3 ]]; then
+        echo "Error: Filename $f is missing fields to extract process ID." >&2
+        continue
+    fi
+    pid=${parts[2]}
+    if ! [[ $pid =~ ^[0-9]+$ ]]; then
+        echo "Error: Component parts[2] of $f is not a numeric PID: $pid" >&2
+        continue
+    fi
+    if [[ ! -r "$f" ]]; then
+        echo "Error: Cannot read $f, skipping." >&2
+        continue
+    fi
+    # Drop the '[' / ']' marker lines and re-emit every event as one compact line
+    if [[ $f == *.gz ]]; then
+        gzip -dc "$f"
+    else
+        cat "$f"
+    fi | grep -v -e '^\[' -e '^\]' | jq -c '.' >>"${output_dir}/temp_${pid}"
+    if [[ ${PIPESTATUS[2]} -ne 0 ]]; then
+        echo "Error: jq could not parse all events in $f" >&2
+    fi
+done
+
+cd "$output_dir" || exit 1
+
+# Split each temp_${pid} file into chunks and wrap every chunk in '[' / ']' lines
+for temp_file in temp_*; do
+    [[ -f "$temp_file" ]] || continue
+    pid=${temp_file#temp_}
+    echo "Processing PID $pid with $num_lines lines per file..."
+    # split does not widen the suffix on its own when --numeric-suffixes=FROM
+    # is used, so size it for the number of chunks (never below 2 digits).
+    total_lines=$(wc -l <"$temp_file")
+    num_chunks=$(((total_lines + num_lines - 1) / num_lines))
+    suffix_len=${#num_chunks}
+    if [ "$suffix_len" -lt 2 ]; then
+        suffix_len=2
+    fi
+    split -l "$num_lines" -a "$suffix_len" --numeric-suffixes=1 --additional-suffix=.pfw "${temp_file}" "${output_prefix}-${pid}-"
+    for file in "${output_prefix}-${pid}-"*.pfw; do
+        [[ -f "$file" ]] || continue
+        {
+            echo "["
+            cat "$file"
+            echo "]"
+        } >"${file}_tmp"
+        mv "${file}_tmp" "$file"
+    done
+    rm "$temp_file"
+done
+
+# Compress files if required
+if [ $compressed -eq 1 ]; then
+    echo "Compressing files..."
+    gzip "${output_prefix}"-*.pfw
+fi
+
+cd "$working_dir" || exit 1
From ec6b1e761485bf35446f801e09051f35d3694b52 Mon Sep 17 00:00:00 2001
From: Izzet Yildirim
Date: Fri, 14 Feb 2025 22:14:14 -0600
Subject: [PATCH 2/4] Add documentation for dftracer_compact_by_pid script usage.

---
 docs/utilities.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/utilities.rst b/docs/utilities.rst
index c00c4d7..cda78de 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -49,6 +49,8 @@ Arguments for this script are
 7. **-v** enable verbose mode.
 8. **-h** display help
 
+Alternatively, the **dftracer_compact_by_pid** script can be used to compact trace files by process ID using the same set of parameters.
+
 ------------------
 Sanitize script
 ------------------
From 6aad4f1ff80347c4a1ea11d2df246ba30ce5c9dc Mon Sep 17 00:00:00 2001
From: Izzet Yildirim
Date: Fri, 14 Feb 2025 22:14:20 -0600
Subject: [PATCH 3/4] Add installation for dftracer_compact_by_pid script.

--- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e7add5..1a95dd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,6 +427,14 @@ install( bin ) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_compact_by_pid ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact_by_pid COPYONLY) +install( + PROGRAMS + ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact_by_pid + DESTINATION + bin +) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_merge ${EXECUTABLE_OUTPUT_PATH}/dftracer_merge COPYONLY) install( FILES From cf2af2e01ed7f45e3b54b0052c7fa5a09123f009 Mon Sep 17 00:00:00 2001 From: Izzet Yildirim Date: Fri, 14 Feb 2025 22:15:20 -0600 Subject: [PATCH 4/4] Add dftracer_compact_by_pid to setup.py scripts and auto-format setup.py. --- setup.py | 125 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 52 deletions(-) diff --git a/setup.py b/setup.py index 6a60cbd..e6708ad 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ import os -import re +import pathlib +import site import subprocess import sys from pathlib import Path -print(sys.argv) -from setuptools import Extension, setup, find_namespace_packages, find_packages + +from setuptools import Extension, find_namespace_packages, setup from setuptools.command.build_ext import build_ext -import site # Convert distutils Windows platform specifiers to CMake -A arguments PLAT_TO_CMAKE = { @@ -31,6 +31,7 @@ def build_extension(self, ext: CMakeExtension) -> None: is_wheel = os.getenv("DFTRACER_WHEEL", "0") == "1" cmake_args = [] from distutils.sysconfig import get_python_lib + project_dir = Path.cwd() # Must be in this form due to bug in .resolve() only fixed in Python 3.10+ ext_fullpath = project_dir / self.get_ext_fullpath(ext.name) @@ -39,18 +40,18 @@ def build_extension(self, ext: CMakeExtension) -> None: print(f"{extdir}") install_prefix = f"{get_python_lib()}/dftracer" if "DFT_LOGGER_USER" in os.environ: - 
install_prefix=f"{site.USER_SITE}/dftracer" + install_prefix = f"{site.USER_SITE}/dftracer" # cmake_args += [f"-DUSER_INSTALL=ON"] if "DFTRACER_INSTALL_DIR" in os.environ: - install_prefix = os.environ['DFTRACER_INSTALL_DIR'] - + install_prefix = os.environ["DFTRACER_INSTALL_DIR"] + python_site = extdir if is_wheel: install_prefix = f"{extdir}/dftracer" if "DFTRACER_PYTHON_SITE" in os.environ: - python_site = os.environ['DFTRACER_PYTHON_SITE'] + python_site = os.environ["DFTRACER_PYTHON_SITE"] # if "DFTRACER_BUILD_DEPENDENCIES" not in os.environ or os.environ['DFTRACER_BUILD_DEPENDENCIES'] == "1": # dependency_file = open(f"{project_dir}/dependency/cpp.requirements.txt", 'r') @@ -63,10 +64,9 @@ def build_extension(self, ext: CMakeExtension) -> None: # os.system(f"bash {project_dir}/dependency/install_dependency.sh {parts[1]} {clone_dir} {install_prefix} {parts[2]} {need_install} {parts[4]} {parts[5]}") import pybind11 as py + py_cmake_dir = py.get_cmake_dir() # py_cmake_dir = os.popen('python3 -c " import pybind11 as py; print(py.get_cmake_dir())"').read() #python("-c", "import pybind11 as py; print(py.get_cmake_dir())", output=str).strip() - - # Using this requires trailing slash for auto-detection & inclusion of # auxiliary "native" libs @@ -81,14 +81,19 @@ def build_extension(self, ext: CMakeExtension) -> None: cmake_args += [f"-DDFTRACER_PYTHON_EXE={sys.executable}"] cmake_args += [f"-DDFTRACER_PYTHON_SITE={python_site}"] cmake_args += [f"-DCMAKE_INSTALL_PREFIX={install_prefix}"] - cmake_args += [f"-DCMAKE_PREFIX_PATH={install_prefix}", f"-Dpybind11_DIR={py_cmake_dir}"] - cmake_args += [f"-DPYBIND11_FINDPYTHON=ON"] + cmake_args += [ + f"-DCMAKE_PREFIX_PATH={install_prefix}", + f"-Dpybind11_DIR={py_cmake_dir}", + ] + cmake_args += ["-DPYBIND11_FINDPYTHON=ON"] cmake_args += ["-DDFTRACER_BUILD_PYTHON_BINDINGS=ON"] # Test related flags enable_tests = os.environ.get("DFTRACER_ENABLE_TESTS", "OFF") cmake_args += [f"-DDFTRACER_ENABLE_TESTS={enable_tests}"] - 
enable_dlio_tests = os.environ.get("DFTRACER_ENABLE_DLIO_BENCHMARK_TESTS", "OFF") + enable_dlio_tests = os.environ.get( + "DFTRACER_ENABLE_DLIO_BENCHMARK_TESTS", "OFF" + ) cmake_args += [f"-DDFTRACER_ENABLE_DLIO_BENCHMARK_TESTS={enable_dlio_tests}"] enable_dlio_tests = os.environ.get("DFTRACER_ENABLE_PAPER_TESTS", "OFF") cmake_args += [f"-DDFTRACER_ENABLE_PAPER_TESTS={enable_dlio_tests}"] @@ -107,7 +112,7 @@ def build_extension(self, ext: CMakeExtension) -> None: # f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/lib", # f"-DPYTHON_EXECUTABLE={sys.executable}", # f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm - #] + # ] build_args = [] # Adding CMake arguments set as environment variable # (needed e.g. to build for ARM OSx on conda-forge) @@ -117,30 +122,37 @@ def build_extension(self, ext: CMakeExtension) -> None: # In this example, we pass in the version to C++. You might not need to. cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"] - # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level # across all generators. 
- build_args += [f"--", "-j"] + build_args += ["--", "-j"] build_temp = Path(self.build_temp) / ext.name if not build_temp.exists(): build_temp.mkdir(parents=True) print("cmake", ext.sourcedir, cmake_args) - if "DFTRACER_BUILD_DEPENDENCIES" not in os.environ or os.environ['DFTRACER_BUILD_DEPENDENCIES'] == "1": + if ( + "DFTRACER_BUILD_DEPENDENCIES" not in os.environ + or os.environ["DFTRACER_BUILD_DEPENDENCIES"] == "1" + ): print("Installing dependencies.") install_cmake_args = cmake_args install_cmake_args += ["-DDFTRACER_INSTALL_DEPENDENCIES=ON"] subprocess.run( - ["cmake", ext.sourcedir, *install_cmake_args], cwd=build_temp, check=True + ["cmake", ext.sourcedir, *install_cmake_args], + cwd=build_temp, + check=True, ) subprocess.run( ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True ) cmake_args += ["-DDFTRACER_INSTALL_DEPENDENCIES=OFF"] # link correct depedencies - cmake_args += [f"-Dyaml-cpp_DIR={install_prefix}", f"-Dpybind11_DIR={py_cmake_dir}"] + cmake_args += [ + f"-Dyaml-cpp_DIR={install_prefix}", + f"-Dpybind11_DIR={py_cmake_dir}", + ] subprocess.run( ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True @@ -148,11 +160,9 @@ def build_extension(self, ext: CMakeExtension) -> None: subprocess.run( ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True ) - subprocess.run( - ["cmake", "--install", "."], cwd=build_temp, check=True - ) + subprocess.run(["cmake", "--install", "."], cwd=build_temp, check=True) + -import pathlib here = pathlib.Path(__file__).parent.resolve() long_description = (here / "README.md").read_text(encoding="utf-8") # The information here can also be placed in setup.cfg - better separation of @@ -186,41 +196,52 @@ def build_extension(self, ext: CMakeExtension) -> None: "Programming Language :: Python :: 3 :: Only", ], install_requires=["pybind11", "zindex_py==0.0.4"], - requires=["pybind11","setuptools"], + requires=["pybind11", "setuptools"], keywords="profiler, deep learning, I/O, benchmark, NPZ, 
pytorch benchmark, tensorflow benchmark", project_urls={ # Optional "Bug Reports": "https://github.com/hariharan-devarajan/dftracer/issues", "Source": "https://github.com/hariharan-devarajan/dftracer", }, - packages=(find_namespace_packages(include=['dftracer', 'dftracer_dbg', 'dfanalyzer'])), - scripts=['script/dftracer_compact', - 'script/dftracer_merge', - 'script/dftracer_sanitize', - 'script/dftracer_anonymize', - 'script/dftracer_split', - 'script/dftracer_create_index', - 'script/dftracer_event_count', - 'script/dftracer_validate', ], - package_dir={"dftracer": "dftracer", - "dftracer_dbg": "dftracer_dbg", - "dfanalyzer": "dfanalyzer"}, - ext_modules=[CMakeExtension("dftracer.pydftracer"), - CMakeExtension("dftracer.pydftracer_dbg")], + packages=( + find_namespace_packages(include=["dftracer", "dftracer_dbg", "dfanalyzer"]) + ), + scripts=[ + "script/dftracer_compact", + "script/dftracer_compact_by_pid", + "script/dftracer_merge", + "script/dftracer_sanitize", + "script/dftracer_anonymize", + "script/dftracer_split", + "script/dftracer_create_index", + "script/dftracer_event_count", + "script/dftracer_validate", + ], + package_dir={ + "dftracer": "dftracer", + "dftracer_dbg": "dftracer_dbg", + "dfanalyzer": "dfanalyzer", + }, + ext_modules=[ + CMakeExtension("dftracer.pydftracer"), + CMakeExtension("dftracer.pydftracer_dbg"), + ], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, - extras_require={"test": ["pytest>=6.0"], - "dfanalyzer": [ - "seaborn>=0.13.2", - "bokeh>=2.4.2", - "pybind11", - "pandas>=2.0.3", - "dask>=2023.5.0", - "distributed", - "numpy>=1.24.3", - "pyarrow>=12.0.1", - "rich>=13.6.0", - "python-intervals>=1.10.0.post1", - "matplotlib>=3.7.3", - ]}, + extras_require={ + "test": ["pytest>=6.0"], + "dfanalyzer": [ + "seaborn>=0.13.2", + "bokeh>=2.4.2", + "pybind11", + "pandas>=2.0.3", + "dask>=2023.5.0", + "distributed", + "numpy>=1.24.3", + "pyarrow>=12.0.1", + "rich>=13.6.0", + "python-intervals>=1.10.0.post1", + "matplotlib>=3.7.3", + ], 
+ }, python_requires=">=3.7", )