diff --git a/.github/workflows/prs.yaml b/.github/workflows/prs.yaml index 8da91da..8137f4c 100644 --- a/.github/workflows/prs.yaml +++ b/.github/workflows/prs.yaml @@ -49,3 +49,6 @@ jobs: run: | pip install pytest pytest tests + - name: Run bash tests + run: | + bash tests/test_rapids-otel-wrap.sh diff --git a/tests/test_rapids-get-telemetry-trace-id.py b/tests/test_rapids-get-telemetry-trace-id.py new file mode 100644 index 0000000..2247e57 --- /dev/null +++ b/tests/test_rapids-get-telemetry-trace-id.py @@ -0,0 +1,66 @@ +import sys +import os.path +import subprocess + +TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools") + +def test_rapids_compute_trace_id(): + result = subprocess.run( + os.path.join(TOOLS_DIR, "rapids-get-telemetry-trace-id"), + env={ + "GITHUB_REPOSITORY": "rapidsai/gha-tools", + "GITHUB_RUN_ID": "1123123", + "GITHUB_RUN_ATTEMPT": "1" + }, + text=True, + capture_output=True, + ) + assert result.stdout.strip() == "22ab4ec60f37f446b4a95917e86660df" + assert result.stderr == "" + assert result.returncode == 0 + +def test_rapids_get_traceparent(): + # this should raise, because OTEL_SERVICE_NAME isn't set + try: + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent")], + env={ + "GITHUB_REPOSITORY": "rapidsai/gha-tools", + "GITHUB_RUN_ID": "1123123", + "GITHUB_RUN_ATTEMPT": "1" + }, + text=True, + capture_output=True, + ) + except subprocess.CalledProcessError: + pass + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"), "my_job"], + env={ + "GITHUB_REPOSITORY": "rapidsai/gha-tools", + "GITHUB_RUN_ID": "1123123", + "GITHUB_RUN_ATTEMPT": "1", + }, + text=True, + capture_output=True, + ) + assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-5f57388b5b07a3e8-01" + assert result.stderr == """JOB_SPAN_ID pre-hash: \"22ab4ec60f37f446b4a95917e86660df-my_job\" +STEP_SPAN_ID pre-hash: \"22ab4ec60f37f446b4a95917e86660df-my_job-\"\n""" + assert result.returncode == 0 + +def test_rapids_get_traceparent_with_step(): + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"), "my_job", "my step"], + env={ + "GITHUB_REPOSITORY": "rapidsai/gha-tools", + "GITHUB_RUN_ID": "1123123", + "GITHUB_RUN_ATTEMPT": "1", + }, + text=True, + capture_output=True, + ) + assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-a6e5bc57fad91889-01" + assert result.stderr == """JOB_SPAN_ID pre-hash: \"22ab4ec60f37f446b4a95917e86660df-my_job\" +STEP_SPAN_ID pre-hash: \"22ab4ec60f37f446b4a95917e86660df-my_job-my step\"\n""" + assert result.returncode == 0 diff --git a/tests/test_rapids-otel-wrap.py b/tests/test_rapids-otel-wrap.py new file mode 100644 index 0000000..72092ee --- /dev/null +++ b/tests/test_rapids-otel-wrap.py @@ -0,0 +1,54 @@ + +import sys +import os.path +import subprocess + +TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools") + +def test_wrap_otel(): + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-otel-wrap"), "echo", "bob"], + text=True, + capture_output=True, + ) + assert result.stdout == "bob\n" + assert result.returncode == 0 + +def test_wrap_otel_with_spaces(): + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-otel-wrap"), "echo", "-n", "bob is here"], + text=True, + capture_output=True, + ) + # Note: no newline here, because echo -n shouldn't end with a newline + assert result.stdout == "bob is here" + assert result.returncode == 0 + +def test_wrap_otel_with_spaces_and_parens(): + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-otel-wrap"), "python", "-c", "import sys; print(sys.version)"], + text=True, + capture_output=True, + ) + assert result.stdout == "{}\n".format(sys.version) + assert result.returncode == 0 + +def test_wrap_otel_with_evil_comparison_operators(): + result = subprocess.run( + [os.path.join(TOOLS_DIR, "rapids-otel-wrap"), "python", "-c", 'print(str(1<2))'], + text=True, + capture_output=True, + ) + assert result.stdout == "True\n" + assert result.returncode == 0 + +# This differs from the test above in that everything is combined into one string, and we're running it as a true shell +def test_wrap_otel_with_evil_comparison_operators_with_shell(): + result = subprocess.run( + '{} python -c "print(str(1<2))"'.format(os.path.join(TOOLS_DIR, "rapids-otel-wrap")), + text=True, + capture_output=True, + shell=True + ) + assert result.stdout == "True\n" + assert result.returncode == 0 diff --git a/tests/test_rapids-otel-wrap.sh b/tests/test_rapids-otel-wrap.sh new file mode 100644 index 0000000..7d58bb5 --- /dev/null +++ b/tests/test_rapids-otel-wrap.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +out=$(${SCRIPT_DIR}/../tools/rapids-otel-wrap echo "abc") +if [ "$out" != "abc" ]; then + echo "error on simple echo case"; exit 1; +fi + +out=$(${SCRIPT_DIR}/../tools/rapids-otel-wrap echo "arg with a space") +if [ "$out" != "arg with a space" ]; then + echo "error on space case"; exit 1; +fi + +out=$(${SCRIPT_DIR}/../tools/rapids-otel-wrap echo "cmd" "arg with a space" --somearg '"blah blah"') +if [ "$out" != 'cmd arg with a space --somearg "blah blah"' ]; then + echo "error on harder space case"; exit 1; +fi + +out=$(${SCRIPT_DIR}/../tools/rapids-otel-wrap cat <&1| tee "${outfile}" + # RAPIDS_OTEL_WRAPPER is optionally passed in as an env var. It is + # used to instrument conda-build or mambabuild for finer-grained spans. + ${RAPIDS_OTEL_WRAPPER:-} ${condaCmd} ${args} 2>&1| tee "${outfile}" exitcode=$? needToRetry=0 needToClean=0 diff --git a/tools/rapids-get-telemetry-trace-id b/tools/rapids-get-telemetry-trace-id new file mode 100755 index 0000000..12a9e1b --- /dev/null +++ b/tools/rapids-get-telemetry-trace-id @@ -0,0 +1,11 @@ +#!/bin/bash +# This is a global, per-run identifier. It is the same across all jobs and all steps within all jobs. +# It is constant from the source repo, to shared-workflows, to shared-actions. + +if [ "$GITHUB_REPOSITORY" = "" ] || [ "${GITHUB_RUN_ID}" = "" ] || [ "${GITHUB_RUN_ATTEMPT}" = "" ]; then + echo "Error: one or more inputs to trace id is empty. The variables that must be set are:" + echo " GITHUB_REPOSITORY, GITHUB_RUN_ID, and GITHUB_RUN_ATTEMPT" + exit 1 +fi +sha="$(echo "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}" | sha256sum | cut -f1 -d' ')" +echo "${sha:0:32}" diff --git a/tools/rapids-get-telemetry-traceparent b/tools/rapids-get-telemetry-traceparent new file mode 100755 index 0000000..137f669 --- /dev/null +++ b/tools/rapids-get-telemetry-traceparent @@ -0,0 +1,38 @@ +#!/bin/bash +# This emits a TRACEPARENT, which follows the w3c trace context standard. +# https://www.w3.org/TR/trace-context/ +# +# This script can operate for two purposes: +# 1. The top level of a job, whether it is the job at the source repo (e.g. rmm) level, or +# the matrix job level +# 2. The steps level within a job, which uses both the job name and the step name +# +# The job name must always be provided as the first argument. +# A step name MAY be provided as the second argument. If it is specified, the output corresponds to +# the step within the context of its job. + +JOB_NAME=$1 +STEP_NAME=${2:-} + +if [ "$JOB_NAME" = "" ]; then + echo "ERROR: JOB_NAME (first parameter) is empty. This means your trace doesn't identify anything." + exit 1 +fi + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +TRACE_ID="$("${SCRIPT_DIR}"/rapids-get-telemetry-trace-id)" +JOB_SPAN_ID="${TRACE_ID}-${JOB_NAME}" +STEP_SPAN_ID="${JOB_SPAN_ID}-${STEP_NAME}" + +echo "JOB_SPAN_ID pre-hash: \"$JOB_SPAN_ID\"" 1>&2 +echo "STEP_SPAN_ID pre-hash: \"$STEP_SPAN_ID\"" 1>&2 + +JOB_TRACEPARENT=$(echo -n "${JOB_SPAN_ID}" | sha256sum | cut -f1 -d' ') +STEP_TRACEPARENT=$(echo -n "${STEP_SPAN_ID}" | sha256sum | cut -f1 -d' ') + +if [ "${STEP_NAME}" != "" ]; then + echo "00-${TRACE_ID}-${STEP_TRACEPARENT:0:16}-01" +else + echo "00-${TRACE_ID}-${JOB_TRACEPARENT:0:16}-01" +fi diff --git a/tools/rapids-otel-wrap b/tools/rapids-otel-wrap new file mode 100755 index 0000000..27ecc13 --- /dev/null +++ b/tools/rapids-otel-wrap @@ -0,0 +1,72 @@ +#!/bin/bash +# Wraps arbitrary commands with arbitrary args. Emits an OpenTelemetry span for tracing the command +# +set -x + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +RAPIDS_OTEL_TRACES_EXPORTER="${RAPIDS_OTEL_TRACES_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}" +RAPIDS_OTEL_METRICS_EXPORTER="${RAPIDS_OTEL_METRICS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}" +RAPIDS_OTEL_LOGS_EXPORTER="${RAPIDS_OTEL_LOGS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}" +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/traces}" +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT="${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/metrics}" +OTEL_EXPORTER_OTLP_LOGS_ENDPOINT="${OTEL_EXPORTER_OTLP_LOGS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/logs}" +export TRACEPARENT +export OTEL_SERVICE_NAME + +set -x + +if [[ $(type otel-cli >/dev/null 2>&1) -eq 0 ]] && [ "$TRACEPARENT" != "" ]; then + if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then + rapids-echo-stderr "Running command with OpenTelemetry instrumentation"; + fi; + + if [ "$OTEL_SERVICE_NAME" = "" ]; then + rapids-echo-stderr "WARNING: OTEL_SERVICE_NAME variable not provided. Traces from different steps may not be associated correctly." + fi + + # Some commands have instrumentation. For example, conda-build has monkey-patched instrumentation + # that can be activated with the opentelemetry-instrument command. For these commands, + # we replace the command with the wrapped command, quoted as a whole for the purposes + # of otel-cli exec, so that flags don't get confused. + case "$1" in + conda* ) + if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then + rapids-echo-stderr "using opentelemetry-instrument for command"; + rapids-echo-stderr "TRACEPARENT prior to otel-cli exec is: \"${TRACEPARENT}\""; + fi; + STEP_TRACEPARENT=$("${SCRIPT_DIR}/rapids-get-telemetry-traceparent" "${OTEL_SERVICE_NAME}" "${STEP_NAME}") + + # otel-cli creates a span for us that bridges the traceparent from the parent process + # shellcheck disable=SC2086,SC2048 + otel-cli exec \ + --name "Run instrumented \"$*\"" \ + --force-parent-span-id "$(cut -d'-' -f3 <<<"$STEP_TRACEPARENT")" \ + --verbose \ + -- \ + opentelemetry-instrument \ + "$@" + ;; + * ) + if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then + rapids-echo-stderr "No opentelemetry instrumentation known for command $*"; + fi; + # shellcheck disable=SC2086,SC2048 + otel-cli exec \ + --name "Run instrumented \"$*\"" \ + --force-parent-span-id "$(cut -d'-' -f3 <<<"$STEP_TRACEPARENT")" \ + --verbose \ + -- "$@" + ;; + esac + RETURN_STATUS=$? +else + if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then + rapids-echo-stderr "Telemetry disabled from lack of otel-cli on PATH or no TRACEPARENT set"; + rapids-echo-stderr "Running command unmodified"; + fi; + "$@" + RETURN_STATUS=$? +fi + +exit "${RETURN_STATUS}"