Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dftracer_compact_by_pid Script #238

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,14 @@ install(
bin
)

# Copy the dftracer_compact_by_pid script verbatim (COPYONLY: no variable
# substitution) into ${EXECUTABLE_OUTPUT_PATH}.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_compact_by_pid ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact_by_pid COPYONLY)
# PROGRAMS rather than FILES: install(FILES) applies read/write-only default
# permissions, which would leave the script in bin without its execute bit.
install(
        PROGRAMS
        ${EXECUTABLE_OUTPUT_PATH}/dftracer_compact_by_pid
        DESTINATION
        bin
)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/script/dftracer_merge ${EXECUTABLE_OUTPUT_PATH}/dftracer_merge COPYONLY)
install(
FILES
Expand Down
2 changes: 2 additions & 0 deletions docs/utilities.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Arguments for this script are
7. **-v** enable verbose mode.
8. **-h** display help

Alternatively, the **dftracer_compact_by_pid** script can be used to compact trace files by process ID using the same set of parameters.

------------------
Sanitize script
------------------
Expand Down
142 changes: 142 additions & 0 deletions script/dftracer_compact_by_pid
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/bin/bash

# dftracer_compact_by_pid
#
# Groups trace data by process ID and rewrites it as files that each hold at
# most a fixed number of lines.
#
# The values below are the defaults; each can be replaced by the matching
# command-line option.

working_dir="${PWD}"        # directory the script was started from
input_dir="${PWD}"          # -d: directory searched for input traces
output_dir="${PWD}/output"  # -o: directory receiving the compacted files
output_prefix="app"         # -p: leading part of every output file name
num_lines=10000             # -l: maximum number of lines per output file
compressed=0                # -c: set to 1 to gzip the output files
override=0                  # -f: set to 1 to reuse a non-empty output directory

#######################################
# Print the command-line help text and terminate the script.
# Arguments: $1 - optional exit status; defaults to 1 so that existing
#                 callers that pass nothing keep failing as before.
# Outputs:   help text on stdout
# Returns:   never returns; exits with the requested status
#######################################
function usage {
  echo "usage: dftracer_compact_by_pid [-fcvh] [-d input_directory] [-o output_directory] [-l num_lines] [-p output_prefix]"
  echo " -f override output directory"
  echo " -c compress output file"
  echo " -v enable verbose mode"
  echo " -h display help"
  echo " -d input_directory specify input directories (must contain .pfw or .pfw.gz files)"
  echo " -o output_directory specify output directory"
  echo " -l num_lines lines per output file"
  echo " -p output_prefix prefix for output files"
  exit "${1:-1}"
}

# Parse command-line arguments.
# The leading ':' in the option string puts getopts in silent mode: a missing
# option argument is reported as ':' and an unknown option as '?', with the
# offending option letter stored in OPTARG.
while getopts ':cvfd:o:l:p:h' opt; do
  case "$opt" in
    d)
      input_dir="${OPTARG}"
      ;;
    o)
      output_dir="${OPTARG}"
      ;;
    l)
      # Reject anything that is not a positive integer right away; otherwise
      # the bad value is only noticed by split, after the output directory
      # has already been wiped and all inputs have been processed.
      if ! [[ "${OPTARG}" =~ ^0*[1-9][0-9]*$ ]]; then
        echo "Option l requires a positive integer, got '${OPTARG}'."
        usage
        exit 1
      fi
      num_lines="${OPTARG}"
      ;;
    p)
      output_prefix="${OPTARG}"
      ;;
    f)
      override=1
      ;;
    v)
      set -x
      ;;
    c)
      compressed=1
      ;;
    h)
      # usage ends the shell it runs in with a failure status. Run it in a
      # subshell so that an explicit request for help exits successfully.
      (usage)
      exit 0
      ;;
    :)
      echo "Option $OPTARG requires an argument."
      usage
      exit 1
      ;;
    \?)
      # Escaped so that only the literal '?' set by getopts matches here.
      echo "Invalid command option."
      usage
      exit 1
      ;;
  esac
done
shift "$((OPTIND - 1))"

# Check and prepare the output directory.
# The directory is deleted and recreated below, so every check here exists to
# make sure that deletion cannot remove something the user did not offer up.
if [[ -z "${output_dir}" ]]; then
  echo "Error: Output directory must not be empty."
  exit 1
fi

if [[ "${override}" -eq 0 && -d "${output_dir}" && -n "$(ls -A "${output_dir}")" ]]; then
  echo "Error: Output directory ${output_dir} is not empty. Use -f to override."
  exit 1
fi

# Refuse to wipe a directory that is, or contains, the input directory:
# the traces would be deleted before they are ever read.
if [[ -d "${output_dir}" && -d "${input_dir}" ]]; then
  # CDPATH is cleared so that cd prints nothing into the captured output.
  resolved_out=$(CDPATH= cd -- "${output_dir}" && pwd -P)
  resolved_in=$(CDPATH= cd -- "${input_dir}" && pwd -P)
  if [[ -n "${resolved_out}" && -n "${resolved_in}" ]]; then
    # Trailing slashes make this a whole-component prefix test, so /a/b does
    # not match /a/bc. An output directory of "/" matches every input path.
    case "${resolved_in}/" in
      "${resolved_out%/}/"*)
        echo "Error: Output directory ${output_dir} contains the input directory ${input_dir}."
        exit 1
        ;;
    esac
  fi
fi

echo "Setting up output directory"
# ':?' aborts instead of expanding to nothing should the variable be empty.
if ! rm -rf -- "${output_dir:?}"; then
  echo "Error: Could not clear output directory ${output_dir}."
  exit 1
fi
if ! mkdir -p -- "${output_dir}"; then
  echo "Error: Could not create output directory ${output_dir}."
  exit 1
fi

# Check for input traces.
# Paths are read NUL-delimited into an array so that names containing
# whitespace or glob characters stay intact. Sorting makes the order in which
# files are appended to each per-PID file reproducible across runs.
trace_files=()
while IFS= read -r -d '' trace_path; do
  trace_files+=("${trace_path}")
done < <(
  find "${input_dir}" -maxdepth 1 -type f \( -name "*.pfw" -o -name "*.pfw.gz" \) -print0 |
    LC_ALL=C sort -z
)

if [[ ${#trace_files[@]} -eq 0 ]]; then
  echo "Error: No input traces found in '$input_dir'"
  exit 1
fi

# Count number of trace files
num_files=${#trace_files[@]}
echo "Found $num_files trace files. Now grouping them by process ID..."

# Process .pfw and .pfw.gz files by extracting process ID and grouping data.
# The process ID is taken from the third '-'-separated field of the file name
# once the .gz and .pfw extensions have been removed.
for f in "${trace_files[@]}"; do
  # A file may disappear between the scan and this point; skip only that
  # file rather than abandoning every file that follows it.
  [[ -e "$f" ]] || continue

  file_name=${f##*/}
  file_name=${file_name%.gz}
  file_name=${file_name%.pfw}
  IFS='-' read -r -a parts <<<"$file_name"
  if [[ ${#parts[@]} -lt 3 ]]; then
    echo "Error: Filename $f is missing fields to extract process ID."
    continue
  fi
  pid=${parts[2]}
  if ! [[ $pid =~ ^[0-9]+$ ]]; then
    echo "Error: Component parts[2] of $f is not a numeric PID: $pid"
    continue
  fi
  if [[ ! -r "$f" ]]; then
    echo "Error: Cannot read $f; skipping it."
    continue
  fi

  # Pick the command that streams the file's plain-text content.
  if [[ $f == *.gz ]]; then
    reader=(gzip -dc --)
  else
    reader=(cat --)
  fi

  # Drop the lines starting with '[' or ']', then let jq re-emit each
  # remaining JSON value on one line, appended to the per-PID file.
  "${reader[@]}" "$f" |
    grep -v -e '^\[' -e '^]' |
    jq -c '.' >>"${output_dir}/temp_${pid}"
  stage_status=("${PIPESTATUS[@]}")

  # grep exits 1 when it selects no lines, which is not an error here, so
  # only the reader and jq are checked. Processing continues either way.
  if [[ ${stage_status[0]} -ne 0 || ${stage_status[2]} -ne 0 ]]; then
    echo "Error: Failed to fully process $f (reader status ${stage_status[0]}, jq status ${stage_status[2]})."
  fi
done

# Everything below uses relative names, so it must run inside the output
# directory. Stop if that directory cannot be entered: continuing would split
# and delete files in whatever directory happens to be current.
cd -- "${output_dir:?}" || {
  echo "Error: Cannot enter output directory ${output_dir}."
  exit 1
}

exit_status=0

# Split each temp_${pid} file into chunks of at most num_lines lines and wrap
# every chunk in a leading '[' line and a trailing ']' line.
for temp_file in temp_*; do
  # An unmatched glob leaves the literal pattern behind; skip it.
  [[ -f "$temp_file" ]] || continue

  pid=${temp_file#temp_}
  echo "Processing PID $pid with $num_lines lines per file..."

  # split does not widen its suffix automatically once a start value is given
  # to --numeric-suffixes, so the default width of 2 runs out after chunk 99.
  # Size the suffix from the number of chunks, never going below 2 so that
  # small outputs keep their usual names.
  suffix_len=2
  if [[ "${num_lines}" =~ ^[0-9]+$ ]] && ((10#${num_lines} > 0)); then
    total_lines=$(wc -l <"${temp_file}")
    chunk_count=$(((total_lines + 10#${num_lines} - 1) / 10#${num_lines}))
    if ((${#chunk_count} > suffix_len)); then
      suffix_len=${#chunk_count}
    fi
  fi

  if ! split -l "$num_lines" -a "${suffix_len}" --numeric-suffixes=1 \
    --additional-suffix=.pfw -- "${temp_file}" "${output_prefix}-${pid}-"; then
    # Keep the grouped data so the failure can be inspected or retried.
    echo "Error: Failed to split data for PID $pid; keeping ${temp_file}."
    exit_status=1
    continue
  fi

  for file in "${output_prefix}-${pid}-"*.pfw; do
    [[ -f "$file" ]] || continue
    # Build the wrapped copy next to the chunk and replace the chunk only
    # once the copy has been written completely.
    if {
      echo "["
      cat -- "$file"
      echo "]"
    } >"${file}_tmp"; then
      mv -- "${file}_tmp" "$file"
    else
      echo "Error: Failed to rewrite ${file}."
      rm -f -- "${file}_tmp"
      exit_status=1
    fi
  done
  rm -- "$temp_file"
done

# Compress files if required
if [[ "${compressed}" -eq 1 ]]; then
  echo "Compressing files..."
  pfw_files=("${output_prefix}"-*.pfw)
  # Call gzip only when the glob matched at least one real file.
  if [[ -e "${pfw_files[0]}" ]]; then
    gzip -- "${pfw_files[@]}" || exit_status=1
  fi
fi

cd -- "${working_dir}" || exit_status=1
exit "${exit_status}"
125 changes: 73 additions & 52 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import os
import re
import pathlib
import site
import subprocess
import sys
from pathlib import Path
print(sys.argv)
from setuptools import Extension, setup, find_namespace_packages, find_packages

from setuptools import Extension, find_namespace_packages, setup
from setuptools.command.build_ext import build_ext
import site

# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
Expand All @@ -31,6 +31,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
is_wheel = os.getenv("DFTRACER_WHEEL", "0") == "1"
cmake_args = []
from distutils.sysconfig import get_python_lib

project_dir = Path.cwd()
# Must be in this form due to bug in .resolve() only fixed in Python 3.10+
ext_fullpath = project_dir / self.get_ext_fullpath(ext.name)
Expand All @@ -39,18 +40,18 @@ def build_extension(self, ext: CMakeExtension) -> None:
print(f"{extdir}")
install_prefix = f"{get_python_lib()}/dftracer"
if "DFT_LOGGER_USER" in os.environ:
install_prefix=f"{site.USER_SITE}/dftracer"
install_prefix = f"{site.USER_SITE}/dftracer"
# cmake_args += [f"-DUSER_INSTALL=ON"]
if "DFTRACER_INSTALL_DIR" in os.environ:
install_prefix = os.environ['DFTRACER_INSTALL_DIR']
install_prefix = os.environ["DFTRACER_INSTALL_DIR"]

python_site = extdir

if is_wheel:
install_prefix = f"{extdir}/dftracer"

if "DFTRACER_PYTHON_SITE" in os.environ:
python_site = os.environ['DFTRACER_PYTHON_SITE']
python_site = os.environ["DFTRACER_PYTHON_SITE"]

# if "DFTRACER_BUILD_DEPENDENCIES" not in os.environ or os.environ['DFTRACER_BUILD_DEPENDENCIES'] == "1":
# dependency_file = open(f"{project_dir}/dependency/cpp.requirements.txt", 'r')
Expand All @@ -63,10 +64,9 @@ def build_extension(self, ext: CMakeExtension) -> None:
# os.system(f"bash {project_dir}/dependency/install_dependency.sh {parts[1]} {clone_dir} {install_prefix} {parts[2]} {need_install} {parts[4]} {parts[5]}")

import pybind11 as py

py_cmake_dir = py.get_cmake_dir()
# py_cmake_dir = os.popen('python3 -c " import pybind11 as py; print(py.get_cmake_dir())"').read() #python("-c", "import pybind11 as py; print(py.get_cmake_dir())", output=str).strip()



# Using this requires trailing slash for auto-detection & inclusion of
# auxiliary "native" libs
Expand All @@ -81,14 +81,19 @@ def build_extension(self, ext: CMakeExtension) -> None:
cmake_args += [f"-DDFTRACER_PYTHON_EXE={sys.executable}"]
cmake_args += [f"-DDFTRACER_PYTHON_SITE={python_site}"]
cmake_args += [f"-DCMAKE_INSTALL_PREFIX={install_prefix}"]
cmake_args += [f"-DCMAKE_PREFIX_PATH={install_prefix}", f"-Dpybind11_DIR={py_cmake_dir}"]
cmake_args += [f"-DPYBIND11_FINDPYTHON=ON"]
cmake_args += [
f"-DCMAKE_PREFIX_PATH={install_prefix}",
f"-Dpybind11_DIR={py_cmake_dir}",
]
cmake_args += ["-DPYBIND11_FINDPYTHON=ON"]
cmake_args += ["-DDFTRACER_BUILD_PYTHON_BINDINGS=ON"]
# Test related flags

enable_tests = os.environ.get("DFTRACER_ENABLE_TESTS", "OFF")
cmake_args += [f"-DDFTRACER_ENABLE_TESTS={enable_tests}"]
enable_dlio_tests = os.environ.get("DFTRACER_ENABLE_DLIO_BENCHMARK_TESTS", "OFF")
enable_dlio_tests = os.environ.get(
"DFTRACER_ENABLE_DLIO_BENCHMARK_TESTS", "OFF"
)
cmake_args += [f"-DDFTRACER_ENABLE_DLIO_BENCHMARK_TESTS={enable_dlio_tests}"]
enable_dlio_tests = os.environ.get("DFTRACER_ENABLE_PAPER_TESTS", "OFF")
cmake_args += [f"-DDFTRACER_ENABLE_PAPER_TESTS={enable_dlio_tests}"]
Expand All @@ -107,7 +112,7 @@ def build_extension(self, ext: CMakeExtension) -> None:
# f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/lib",
# f"-DPYTHON_EXECUTABLE={sys.executable}",
# f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
#]
# ]
build_args = []
# Adding CMake arguments set as environment variable
# (needed e.g. to build for ARM OSx on conda-forge)
Expand All @@ -117,42 +122,47 @@ def build_extension(self, ext: CMakeExtension) -> None:
# In this example, we pass in the version to C++. You might not need to.
cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]


# Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
# across all generators.
build_args += [f"--", "-j"]
build_args += ["--", "-j"]

build_temp = Path(self.build_temp) / ext.name
if not build_temp.exists():
build_temp.mkdir(parents=True)
print("cmake", ext.sourcedir, cmake_args)

if "DFTRACER_BUILD_DEPENDENCIES" not in os.environ or os.environ['DFTRACER_BUILD_DEPENDENCIES'] == "1":
if (
"DFTRACER_BUILD_DEPENDENCIES" not in os.environ
or os.environ["DFTRACER_BUILD_DEPENDENCIES"] == "1"
):
print("Installing dependencies.")
install_cmake_args = cmake_args
install_cmake_args += ["-DDFTRACER_INSTALL_DEPENDENCIES=ON"]

subprocess.run(
["cmake", ext.sourcedir, *install_cmake_args], cwd=build_temp, check=True
["cmake", ext.sourcedir, *install_cmake_args],
cwd=build_temp,
check=True,
)
subprocess.run(
["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
)
cmake_args += ["-DDFTRACER_INSTALL_DEPENDENCIES=OFF"]
# link correct depedencies
cmake_args += [f"-Dyaml-cpp_DIR={install_prefix}", f"-Dpybind11_DIR={py_cmake_dir}"]
cmake_args += [
f"-Dyaml-cpp_DIR={install_prefix}",
f"-Dpybind11_DIR={py_cmake_dir}",
]

subprocess.run(
["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
)
subprocess.run(
["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
)
subprocess.run(
["cmake", "--install", "."], cwd=build_temp, check=True
)
subprocess.run(["cmake", "--install", "."], cwd=build_temp, check=True)


import pathlib
here = pathlib.Path(__file__).parent.resolve()
long_description = (here / "README.md").read_text(encoding="utf-8")
# The information here can also be placed in setup.cfg - better separation of
Expand Down Expand Up @@ -186,41 +196,52 @@ def build_extension(self, ext: CMakeExtension) -> None:
"Programming Language :: Python :: 3 :: Only",
],
install_requires=["pybind11", "zindex_py==0.0.4"],
requires=["pybind11","setuptools"],
requires=["pybind11", "setuptools"],
keywords="profiler, deep learning, I/O, benchmark, NPZ, pytorch benchmark, tensorflow benchmark",
project_urls={ # Optional
"Bug Reports": "https://github.com/hariharan-devarajan/dftracer/issues",
"Source": "https://github.com/hariharan-devarajan/dftracer",
},
packages=(find_namespace_packages(include=['dftracer', 'dftracer_dbg', 'dfanalyzer'])),
scripts=['script/dftracer_compact',
'script/dftracer_merge',
'script/dftracer_sanitize',
'script/dftracer_anonymize',
'script/dftracer_split',
'script/dftracer_create_index',
'script/dftracer_event_count',
'script/dftracer_validate', ],
package_dir={"dftracer": "dftracer",
"dftracer_dbg": "dftracer_dbg",
"dfanalyzer": "dfanalyzer"},
ext_modules=[CMakeExtension("dftracer.pydftracer"),
CMakeExtension("dftracer.pydftracer_dbg")],
packages=(
find_namespace_packages(include=["dftracer", "dftracer_dbg", "dfanalyzer"])
),
scripts=[
"script/dftracer_compact",
"script/dftracer_compact_by_pid",
"script/dftracer_merge",
"script/dftracer_sanitize",
"script/dftracer_anonymize",
"script/dftracer_split",
"script/dftracer_create_index",
"script/dftracer_event_count",
"script/dftracer_validate",
],
package_dir={
"dftracer": "dftracer",
"dftracer_dbg": "dftracer_dbg",
"dfanalyzer": "dfanalyzer",
},
ext_modules=[
CMakeExtension("dftracer.pydftracer"),
CMakeExtension("dftracer.pydftracer_dbg"),
],
cmdclass={"build_ext": CMakeBuild},
zip_safe=False,
extras_require={"test": ["pytest>=6.0"],
"dfanalyzer": [
"seaborn>=0.13.2",
"bokeh>=2.4.2",
"pybind11",
"pandas>=2.0.3",
"dask>=2023.5.0",
"distributed",
"numpy>=1.24.3",
"pyarrow>=12.0.1",
"rich>=13.6.0",
"python-intervals>=1.10.0.post1",
"matplotlib>=3.7.3",
]},
extras_require={
"test": ["pytest>=6.0"],
"dfanalyzer": [
"seaborn>=0.13.2",
"bokeh>=2.4.2",
"pybind11",
"pandas>=2.0.3",
"dask>=2023.5.0",
"distributed",
"numpy>=1.24.3",
"pyarrow>=12.0.1",
"rich>=13.6.0",
"python-intervals>=1.10.0.post1",
"matplotlib>=3.7.3",
],
},
python_requires=">=3.7",
)
Loading