Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add opentelemetry helper scripts (#119) #120

Merged
merged 15 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/prs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,6 @@ jobs:
run: |
pip install pytest
pytest tests
- name: Run bash tests
run: |
bash tests/test_rapids-otel-wrap.sh
66 changes: 66 additions & 0 deletions tests/test_rapids-get-telemetry-trace-id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
import os.path
import subprocess

TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools")

def test_rapids_compute_trace_id():
    """Run the trace-id tool with a fixed GitHub environment and pin its output."""
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }
    script = os.path.join(TOOLS_DIR, "rapids-get-telemetry-trace-id")
    completed = subprocess.run(
        script,
        env=github_env,
        text=True,
        capture_output=True,
    )
    # A successful run prints only the 32-character id and nothing on stderr.
    assert completed.stdout.strip() == "22ab4ec60f37f446b4a95917e86660df"
    assert completed.stderr == ""
    assert completed.returncode == 0

def test_rapids_get_traceparent():
    """Check the job-level traceparent, and that a missing job name is an error."""
    script = os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent")
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }

    # With no job name argument the script must fail. subprocess.run only
    # raises CalledProcessError when check=True is passed, so the exit status
    # is asserted directly rather than relying on an exception.
    result = subprocess.run(
        [script],
        env=github_env,
        text=True,
        capture_output=True,
    )
    assert result.returncode != 0
    # Accept the diagnostic on either stream; only its content matters here.
    assert "JOB_NAME" in result.stdout + result.stderr

    result = subprocess.run(
        [script, "my_job"],
        env=github_env,
        text=True,
        capture_output=True,
    )
    assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-5f57388b5b07a3e8-01"
    # With no step name, the step pre-hash string ends in a bare "-".
    assert result.stderr == (
        'JOB_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job"\n'
        'STEP_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job-"\n'
    )
    assert result.returncode == 0

def test_rapids_get_traceparent_with_step():
    """Pin the traceparent produced for a job name plus a step name."""
    command = [
        os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"),
        "my_job",
        "my step",
    ]
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }
    completed = subprocess.run(command, env=github_env, text=True, capture_output=True)

    expected_stderr = (
        'JOB_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job"\n'
        'STEP_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job-my step"\n'
    )
    assert completed.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-a6e5bc57fad91889-01"
    assert completed.stderr == expected_stderr
    assert completed.returncode == 0
54 changes: 54 additions & 0 deletions tests/test_rapids-otel-wrap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@

import sys
import os.path
import subprocess

TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools")

def test_wrap_otel():
    """A plain `echo` run through the wrapper keeps its stdout and exit status."""
    wrapper = os.path.join(TOOLS_DIR, "rapids-otel-wrap")
    completed = subprocess.run(
        [wrapper, "echo", "bob"],
        text=True,
        capture_output=True,
    )
    assert completed.stdout == "bob\n"
    assert completed.returncode == 0

def test_wrap_otel_with_spaces():
    """An argument containing spaces reaches the wrapped command as one argument."""
    command = [os.path.join(TOOLS_DIR, "rapids-otel-wrap"), "echo", "-n", "bob is here"]
    completed = subprocess.run(command, text=True, capture_output=True)
    # `echo -n` suppresses the trailing newline, so none is expected here.
    assert completed.stdout == "bob is here"
    assert completed.returncode == 0

def test_wrap_otel_with_spaces_and_parens():
    """A Python one-liner with spaces and parentheses survives the wrapper."""
    result = subprocess.run(
        [
            os.path.join(TOOLS_DIR, "rapids-otel-wrap"),
            # Use the interpreter running this test: the assertion compares
            # against this process's sys.version, which a different `python`
            # found on PATH would not necessarily match.
            sys.executable,
            "-c",
            "import sys; print(sys.version)",
        ],
        text=True,
        capture_output=True,
    )
    assert result.stdout == "{}\n".format(sys.version)
    assert result.returncode == 0

def test_wrap_otel_with_evil_comparison_operators():
    """A `<` inside an argument is passed through literally, not as a redirect."""
    result = subprocess.run(
        [
            os.path.join(TOOLS_DIR, "rapids-otel-wrap"),
            # sys.executable avoids depending on a bare `python` being on PATH.
            sys.executable,
            "-c",
            "print(str(1<2))",
        ],
        text=True,
        capture_output=True,
    )
    assert result.stdout == "True\n"
    assert result.returncode == 0

# This differs from the test above in that everything is combined into one string, and we're running it as a true shell
def test_wrap_otel_with_evil_comparison_operators_with_shell():
    """Same `<` check, but the whole command line is parsed by a real shell."""
    import shlex

    # Quote the two paths so a checkout or interpreter location containing
    # spaces or shell metacharacters is still read by the shell as one word.
    wrapper = shlex.quote(os.path.join(TOOLS_DIR, "rapids-otel-wrap"))
    interpreter = shlex.quote(sys.executable)
    result = subprocess.run(
        '{} {} -c "print(str(1<2))"'.format(wrapper, interpreter),
        text=True,
        capture_output=True,
        shell=True,
    )
    assert result.stdout == "True\n"
    assert result.returncode == 0
37 changes: 37 additions & 0 deletions tests/test_rapids-otel-wrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Shell-level checks that rapids-otel-wrap hands arguments and stdin to the
# wrapped command unchanged. Exits 1 at the first failing case.

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Kept in one variable and quoted at every use, so a checkout path containing
# spaces or glob characters is not word-split.
WRAP="${SCRIPT_DIR}/../tools/rapids-otel-wrap"

out=$("${WRAP}" echo "abc")
if [ "$out" != "abc" ]; then
  echo "error on simple echo case"; exit 1;
fi

out=$("${WRAP}" echo "arg with a space")
if [ "$out" != "arg with a space" ]; then
  echo "error on space case"; exit 1;
fi

# Mixed arguments: one with spaces, a flag, and one carrying literal double quotes.
out=$("${WRAP}" echo "cmd" "arg with a space" --somearg '"blah blah"')
if [ "$out" != 'cmd arg with a space --somearg "blah blah"' ]; then
  echo "error on harder space case"; exit 1;
fi

# stdin passthrough: the heredoc body must come back from `cat` byte-for-byte.
out=$("${WRAP}" cat <<EOF
arg" with a space --somearg 'a<b'
EOF
);
if [ "$out" != "arg\" with a space --somearg 'a<b'" ]; then
  echo "error on inequality case as heredoc";
  echo "output was:"
  echo "$out"
  exit 1;
fi

# A "<" inside an argument must not be treated as a redirection by the wrapper.
out=$("${WRAP}" echo "a<b");
if [ "$out" != "a<b" ]; then
  echo "error on inequality case as arg";
  echo "output was:"
  echo "$out"
  exit 1;
fi
4 changes: 3 additions & 1 deletion tools/rapids-conda-retry
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ condaCmd=${RAPIDS_CONDA_EXE:=conda}
# needToRetry: 1 if the command should be retried, 0 if it should not be
function runConda {
# shellcheck disable=SC2086
${condaCmd} ${args} 2>&1| tee "${outfile}"
# RAPIDS_OTEL_WRAPPER is optionally passed in as an env var. It is
# used to instrument conda-build or mambabuild for finer-grained spans.
${RAPIDS_OTEL_WRAPPER:-} ${condaCmd} ${args} 2>&1| tee "${outfile}"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes the wrapper opt-in, which I hope avoids any lingering possible issues that occurred last time we tried to add this. The wrapper won't be present at all unless it is added as an environment variable. This will happen in shared-workflows files.

exitcode=$?
needToRetry=0
needToClean=0
Expand Down
11 changes: 11 additions & 0 deletions tools/rapids-get-telemetry-trace-id
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# This is a global, per-run identifier. It is the same across all jobs and all steps within all jobs.
# It is constant from the source repo, to shared-workflows, to shared-actions.
#
# Required environment: GITHUB_REPOSITORY, GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT.
# Output (stdout): the first 32 hex characters of the SHA-256 of
#   "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}" followed by a newline.
# Exits 1 if any of the required variables is empty or unset.

if [ -z "${GITHUB_REPOSITORY}" ] || [ -z "${GITHUB_RUN_ID}" ] || [ -z "${GITHUB_RUN_ATTEMPT}" ]; then
  # Diagnostics go to stderr: callers capture stdout as the trace id, so an
  # error message printed there would be mistaken for an id.
  echo "Error: one or more inputs to trace id is empty. The variables that must be set are:" >&2
  echo " GITHUB_REPOSITORY, GITHUB_RUN_ID, and GITHUB_RUN_ATTEMPT" >&2
  exit 1
fi

# The trailing newline that echo appends is part of the hashed input; dropping
# it (e.g. switching to `echo -n`) would change every trace id.
sha="$(echo "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}" | sha256sum | cut -f1 -d' ')"
echo "${sha:0:32}"
38 changes: 38 additions & 0 deletions tools/rapids-get-telemetry-traceparent
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Emits a TRACEPARENT value following the W3C trace context standard:
# https://www.w3.org/TR/trace-context/
#
# Two uses are supported:
#   1. Job level - either the job in the source repo (e.g. rmm) or a matrix job.
#   2. Step level - a step inside a job, identified by job name plus step name.
#
# Usage: rapids-get-telemetry-traceparent JOB_NAME [STEP_NAME]
#   JOB_NAME   required, first argument.
#   STEP_NAME  optional, second argument. When given, the output identifies the
#              step within the context of its job.

JOB_NAME="${1:-}"
STEP_NAME="${2:-}"
msarahan marked this conversation as resolved.
Show resolved Hide resolved

if [ "$JOB_NAME" = "" ]; then
  # stdout carries the traceparent that callers capture, so errors go to stderr.
  echo "ERROR: JOB_NAME (first parameter) is empty. This means your trace doesn't identify anything." 1>&2
  exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# If the trace id helper fails, whatever it wrote to stdout is a diagnostic and
# not an id. Forward it to stderr and stop instead of building a traceparent
# from it.
if ! TRACE_ID="$("${SCRIPT_DIR}"/rapids-get-telemetry-trace-id)"; then
  if [ "${TRACE_ID}" != "" ]; then
    echo "${TRACE_ID}" 1>&2
  fi
  echo "ERROR: unable to compute the trace id; no traceparent emitted." 1>&2
  exit 1
fi

JOB_SPAN_ID="${TRACE_ID}-${JOB_NAME}"
STEP_SPAN_ID="${JOB_SPAN_ID}-${STEP_NAME}"

# Debug output: the exact strings that get hashed into span ids.
echo "JOB_SPAN_ID pre-hash: \"$JOB_SPAN_ID\"" 1>&2
echo "STEP_SPAN_ID pre-hash: \"$STEP_SPAN_ID\"" 1>&2

# With a step name the span id is derived from the step string, otherwise from
# the job string.
if [ "${STEP_NAME}" != "" ]; then
  SPAN_SOURCE="${STEP_SPAN_ID}"
else
  SPAN_SOURCE="${JOB_SPAN_ID}"
fi

# No trailing newline is hashed here (printf '%s' emits the string only).
SPAN_HASH=$(printf '%s' "${SPAN_SOURCE}" | sha256sum | cut -f1 -d' ')

# Layout: version "00", the trace id, the first 16 hex chars of the span hash,
# and flags "01".
echo "00-${TRACE_ID}-${SPAN_HASH:0:16}-01"
66 changes: 66 additions & 0 deletions tools/rapids-otel-wrap
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash
# Wraps arbitrary commands with arbitrary args. Emits an OpenTelemetry span for tracing the command
#
# Usage: rapids-otel-wrap COMMAND [ARGS...]
#
# The command is run through `otel-cli exec` only when otel-cli is on PATH and
# TRACEPARENT is non-empty; otherwise it is run unmodified. Either way this
# script exits with the status of the command it ran.
#
# Environment read here: TRACEPARENT, OTEL_SERVICE_NAME, STEP_NAME,
# RAPIDS_OTEL_EXPORTER, RAPIDS_OTEL_{TRACES,METRICS,LOGS}_EXPORTER,
# OTEL_EXPORTER_OTLP_ENDPOINT and OTEL_EXPORTER_OTLP_{TRACES,METRICS,LOGS}_ENDPOINT.

# Trace each command to stderr; stdout is left to the wrapped command.
set -x

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# NOTE(review): the RAPIDS_OTEL_*_EXPORTER values are assigned but not
# referenced again or exported in this script - confirm whether a consumer
# expects them in the environment.
RAPIDS_OTEL_TRACES_EXPORTER="${RAPIDS_OTEL_TRACES_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
RAPIDS_OTEL_METRICS_EXPORTER="${RAPIDS_OTEL_METRICS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
RAPIDS_OTEL_LOGS_EXPORTER="${RAPIDS_OTEL_LOGS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
# Per-signal endpoints default to the shared endpoint plus the signal's path.
# NOTE(review): when OTEL_EXPORTER_OTLP_ENDPOINT is unset these default to a
# bare "/v1/..." path; they are only visible to child processes if they were
# already exported by the caller.
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/traces}"
OTEL_EXPORTER_OTLP_METRICS_ENDPOINT="${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/metrics}"
OTEL_EXPORTER_OTLP_LOGS_ENDPOINT="${OTEL_EXPORTER_OTLP_LOGS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/logs}"
export TRACEPARENT
export OTEL_SERVICE_NAME

# Test the exit status of `type` directly. Wrapping it in $(...) and comparing
# the (empty) captured output with `-eq 0` is always true, which would send us
# down the otel-cli path even when otel-cli is not installed.
if type otel-cli >/dev/null 2>&1 && [ "${TRACEPARENT}" != "" ]; then
  echo "Running command with OpenTelemetry instrumentation" >&2

  if [ "${OTEL_SERVICE_NAME}" = "" ]; then
    echo "WARNING: OTEL_SERVICE_NAME variable not provided. Traces from different steps may not be associated correctly." >&2
  fi

  # Arguments shared by every otel-cli invocation below.
  otel_args=(--name "Run instrumented \"$*\"" --verbose)

  # Both branches below parent their span on the step's span id, so compute it
  # once up front. If the helper fails (for example because OTEL_SERVICE_NAME
  # is empty) its output is not a traceparent, so do not force a parent id.
  if STEP_TRACEPARENT=$("${SCRIPT_DIR}/rapids-get-telemetry-traceparent" "${OTEL_SERVICE_NAME}" "${STEP_NAME:-}"); then
    # The span id is the third dash-separated field of the traceparent.
    otel_args+=(--force-parent-span-id "$(cut -d'-' -f3 <<<"${STEP_TRACEPARENT}")")
  else
    echo "WARNING: could not compute a step traceparent; not forcing a parent span id." >&2
  fi

  # Some commands have instrumentation. For example, conda-build has monkey-patched instrumentation
  # that can be activated with the opentelemetry-instrument command. For these commands,
  # opentelemetry-instrument is placed in front of the wrapped command, after the `--`
  # separator so that otel-cli does not interpret the command's flags.
  case "$1" in
    conda* )
      echo "using opentelemetry-instrument for command" >&2
      echo "TRACEPARENT prior to otel-cli exec is: \"${TRACEPARENT}\"" >&2

      # otel-cli creates a span for us that bridges the traceparent from the parent process
      # into the command we're wrapping
      otel-cli exec "${otel_args[@]}" -- opentelemetry-instrument "$@"
      ;;
    * )
      echo "No opentelemetry instrumentation known for command $*" >&2
      otel-cli exec "${otel_args[@]}" -- "$@"
      ;;
  esac
  RETURN_STATUS=$?
else
  # Telemetry is disabled (no otel-cli on PATH, or no TRACEPARENT set):
  # run the command unmodified.
  "$@"
  RETURN_STATUS=$?
fi

exit "${RETURN_STATUS}"