diff --git a/test/tarantool-c-tests/gh-8594-sysprof-ffunc-crash.test.c b/test/tarantool-c-tests/gh-8594-sysprof-ffunc-crash.test.c
new file mode 100644
index 0000000000..cb356eee4d
--- /dev/null
+++ b/test/tarantool-c-tests/gh-8594-sysprof-ffunc-crash.test.c
@@ -0,0 +1,260 @@
+#include "lauxlib.h"
+#include "lmisclib.h"
+#include "lua.h"
+
+#include "test.h"
+#include "utils.h"
+
+#include <signal.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* XXX: Still need normal assert inside <tracee> and helpers. */
+#undef NDEBUG
+#include <assert.h>
+
+/* XXX: Need for skipcond for OS and ARCH. */
+#include "lj_arch.h"
+
+/*
+ * XXX: The test makes sysprof to collect the particular event
+ * (FFUNC) at the particular instruction (<lj_fff_res1>) to
+ * reproduce the issue #8594. Hence it's enough to call
+ * fast function (this is done in <tracee> function). To emit
+ * SIGPROF right at <lj_fff_res1> in scope of <tostring> fast
+ * function, the managed execution is implemented in the function
+ * <tracer>: <int 3> instruction is poisoned as the first
+ * instruction at <lj_ff_tostring> to stop <tracee> at the
+ * beginning of the fast function; <tracer> resumes the <tracee>;
+ * the same hack is done for <lj_fff_res1>. When the <tracee> hits
+ * the interruption at <lj_fff_res1>, SIGPROF is emitted while
+ * resuming the <tracee>. As a result sysprof collects the full
+ * backtrace with <tostring> fast function as the topmost frame.
+ *
+ * See more info here:
+ * * https://man7.org/linux/man-pages/man2/ptrace.2.html
+ * * https://github.com/tarantool/tarantool/issues/8594
+ * * https://github.com/tarantool/tarantool/issues/9387
+ */
+
+#define MESSAGE "Canary is alive"
+#define LUACALL "local a = tostring('" MESSAGE "') return a"
+/* XXX: Resolve the necessary addresses from VM engine. */
+extern void *lj_ff_tostring(void);
+extern void *lj_fff_res1(void);
+
+/* Sysprof "/dev/null" stream helpers. {{{ */
+
+/*
+ * Yep, 8Mb. Tuned in order not to bother the platform with too
+ * often flushes.
+ */
+#define STREAM_BUFFER_SIZE (8 * 1024 * 1024)
+#define DEVNULL -1
+
+struct devnull_ctx {
+	/*
+	 * XXX: Dummy file descriptor to be used as "/dev/null"
+	 * context indicator in writer and on_stop callback.
+	 */
+	int fd;
+	/* Buffer for data recorded by sysprof. */
+	uint8_t buf[STREAM_BUFFER_SIZE];
+};
+
+static int stream_new(struct luam_Sysprof_Options *options) {
+	struct devnull_ctx *ctx = calloc(1, sizeof(struct devnull_ctx));
+	if (ctx == NULL)
+		return PROFILE_ERRIO;
+
+	/* Set "/dev/null" context indicator. */
+	ctx->fd = DEVNULL;
+	options->ctx = ctx;
+	options->buf = ctx->buf;
+	options->len = STREAM_BUFFER_SIZE;
+
+	return PROFILE_SUCCESS;
+}
+
+static int stream_delete(void *rawctx, uint8_t *buf) {
+	struct devnull_ctx *ctx = rawctx;
+	assert(ctx->fd == DEVNULL);
+	free(ctx);
+	return PROFILE_SUCCESS;
+}
+
+static size_t stream_writer(const void **buf_addr, size_t len, void *rawctx) {
+	struct devnull_ctx *ctx = rawctx;
+	assert(ctx->fd == DEVNULL);
+	/* Do nothing, just return back to the profiler. */
+	return STREAM_BUFFER_SIZE;
+}
+
+/* }}} Sysprof "/dev/null" stream helpers. */
+
+static int tracee(const char *luacode) {
+	struct luam_Sysprof_Counters counters = {};
+	struct luam_Sysprof_Options opt = {
+		/* Collect full backtraces per event. */
+		.mode = LUAM_SYSPROF_CALLGRAPH,
+		/*
+		 * XXX: Setting the "endless timer". The test
+		 * requires the single event to be streamed at
+		 * <lj_fff_res1> instruction, so to avoid spoiling
+		 * the stream with other unwanted events, the
+		 * timer is set to some unreachable point, so the
+		 * profiler will be guaranteed to stop before any
+		 * event is emitted.
+		 */
+		.interval = -1ULL,
+	};
+
+	/* Allow tracing for this process */
+	if (ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
+		perror("Failed to turn the calling thread into a tracee");
+		return EXIT_FAILURE;
+	}
+
+	/*
+	 * XXX: Allow parent (which is our tracer now) to observe
+	 * our signal-delivery-stop (i.e. the tracee is ready).
+	 * For more info see ptrace(2), "Attaching and detaching".
+	 */
+	raise(SIGSTOP);
+
+	lua_State *L = utils_lua_init();
+
+	/* Customize and start profiler. */
+	assert(stream_new(&opt) == PROFILE_SUCCESS);
+	assert(luaM_sysprof_set_writer(stream_writer) == PROFILE_SUCCESS);
+	assert(luaM_sysprof_set_on_stop(stream_delete) == PROFILE_SUCCESS);
+	assert(luaM_sysprof_start(L, &opt) == PROFILE_SUCCESS);
+
+	/* FIXME: Make this part test agnostic. */
+	assert(luaL_dostring(L, luacode) == LUA_OK);
+	assert(strcmp(lua_tostring(L, -1), MESSAGE) == 0);
+
+	/* Terminate profiler and Lua universe. */
+	assert(luaM_sysprof_stop(L) == PROFILE_SUCCESS);
+	utils_lua_close(L);
+
+	/*
+	 * XXX: The only event to be streamed must be FFUNC at
+	 * <lj_fff_res1> instruction.
+	 * FIXME: Make this part test agnostic.
+	 */
+	assert(luaM_sysprof_report(&counters) == PROFILE_SUCCESS);
+	assert(counters.samples == 1);
+	assert(counters.vmst_ffunc == 1);
+
+	return EXIT_SUCCESS;
+}
+
+const uint8_t INT3 = 0xCC;
+static inline unsigned long int3poison(unsigned long instruction) {
+	const size_t int3bits = sizeof(INT3) * 8;
+	const unsigned long int3mask = -1UL >> int3bits << int3bits;
+	return (instruction & int3mask) | INT3;
+}
+
+static int continue_until(pid_t chpid, void *addr) {
+	int wstatus;
+	struct user_regs_struct regs;
+
+	/* Obtain the instructions at the <addr>. */
+	unsigned long data = ptrace(PTRACE_PEEKTEXT, chpid, addr, 0);
+	/*
+	 * Emit the <int 3> instruction to the <addr>.
+	 * XXX: <int 3> is poisoned as the LSB to the <data>
+	 * obtained from the <addr> above.
+	 */
+	ptrace(PTRACE_POKETEXT, chpid, addr, int3poison(data));
+
+	/* Resume tracee until SIGTRAP occurs. */
+	ptrace(PTRACE_CONT, chpid, 0, 0);
+	/* Wait tracee signal-delivery-stop. */
+	waitpid(chpid, &wstatus, 0);
+
+	/* Obtain GPR set to tweak RIP for further execution. */
+	ptrace(PTRACE_GETREGS, chpid, 0, &regs);
+	/*
+	 * Make sure we indeed are stopped at <addr>.
+	 * XXX: RIP points right after <int 3> instruction.
+	 */
+	assert(regs.rip == (long)addr + sizeof(INT3));
+
+	/*
+	 * XXX: Restore the original instruction at <addr> and
+	 * "rewind" RIP by <int 3> size to "replay" the poisoned
+	 * instruction at the <addr>.
+	 */
+	regs.rip -= sizeof(INT3);
+	ptrace(PTRACE_SETREGS, chpid, 0, &regs);
+	ptrace(PTRACE_POKETEXT, chpid, addr, data);
+
+	/* Return wait status to the caller for test checks. */
+	return wstatus;
+}
+
+static int tracer(pid_t chpid) {
+	int wstatus;
+
+	/* Wait until tracee is ready. */
+	waitpid(chpid, &wstatus, 0);
+
+	/* Resume tracee until <lj_ff_tostring>. */
+	wstatus = continue_until(chpid, lj_ff_tostring);
+
+	/* The tracee has to be alive and stopped by SIGTRAP. */
+	assert_false(WIFEXITED(wstatus));
+	assert_true(WIFSTOPPED(wstatus));
+
+	/* Resume tracee until <lj_fff_res1>. */
+	wstatus = continue_until(chpid, lj_fff_res1);
+
+	/* The tracee has to be alive and stopped by SIGTRAP. */
+	assert_false(WIFEXITED(wstatus));
+	assert_true(WIFSTOPPED(wstatus));
+
+	/* Send SIGPROF to make sysprof collect the event. */
+	ptrace(PTRACE_CONT, chpid, 0, SIGPROF);
+
+	/* Wait until tracee successfully exits. */
+	waitpid(chpid, &wstatus, 0);
+	assert_true(WIFEXITED(wstatus));
+
+	return TEST_EXIT_SUCCESS;
+}
+
+static int test_tostring_call(void *ctx) {
+	pid_t chpid = fork();
+	switch(chpid) {
+	case -1:
+		bail_out("Tracee fork failed");
+	case 0:
+		/*
+		 * XXX: Tracee has to <exit> instead of <return>
+		 * to avoid duplicate reports in <test_run_group>.
+		 * Test assertions are used only in the <tracer>,
+		 * so the <tracer> ought to report whether the
+		 * test succeeded or not.
+		 */
+		exit(tracee(LUACALL));
+	default:
+		return tracer(chpid);
+	}
+}
+
+int main(void) {
+	if (LUAJIT_OS != LUAJIT_OS_LINUX)
+		return skip_all("Sysprof is implemented for Linux only");
+	if (LUAJIT_TARGET != LUAJIT_ARCH_X64)
+		return skip_all("Sysprof is implemented for x86_64 only");
+
+	const struct test_unit tgroup[] = {
+		test_unit_def(test_tostring_call),
+	};
+	return test_run_group(tgroup, NULL);
+}
diff --git a/test/tarantool-tests/gh-8594-sysprof-ffunc-crash.test.lua b/test/tarantool-tests/gh-8594-sysprof-ffunc-crash.test.lua
deleted file mode 100644
index f8b409ae97..0000000000
--- a/test/tarantool-tests/gh-8594-sysprof-ffunc-crash.test.lua
+++ /dev/null
@@ -1,55 +0,0 @@
-local tap = require('tap')
-local test = tap.test('gh-8594-sysprof-ffunc-crash'):skipcond({
-  ['Sysprof is implemented for x86_64 only'] = jit.arch ~= 'x86' and
-                                               jit.arch ~= 'x64',
-  ['Sysprof is implemented for Linux only'] = jit.os ~= 'Linux',
-  -- luacheck: no global
-  ['Prevent hanging Tarantool CI due to #9387'] = _TARANTOOL,
-})
-
-test:plan(1)
-
-jit.off()
--- XXX: Run JIT tuning functions in a safe frame to avoid errors
--- thrown when LuaJIT is compiled with JIT engine disabled.
-pcall(jit.flush)
-
-local TMP_BINFILE = '/dev/null'
-
--- XXX: The best way to test the issue is to set the profile
--- interval to be as short as possible. However, our CI is
--- not capable of handling such intense testing, so it was a
--- forced decision to reduce the sampling frequency for it. As a
--- result, it is now less likely to reproduce the issue
--- statistically, but the test case is still valid.
-
--- GitHub always sets[1] the `CI` environment variable to `true`
--- for every step in a workflow.
--- [1]: https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables
-local CI = os.getenv('CI') == 'true'
-
--- Profile interval and number of iterations for CI are
--- empirical. Non-CI profile interval is set to be as short
--- as possible, so the issue is more likely to reproduce.
--- Non-CI number of iterations is greater for the same reason.
-local PROFILE_INTERVAL = CI and 3 or 1
-local N_ITERATIONS = CI and 1e5 or 1e6
-
-local res, err = misc.sysprof.start{
-  mode = 'C',
-  interval = PROFILE_INTERVAL,
-  path = TMP_BINFILE,
-}
-assert(res, err)
-
-for i = 1, N_ITERATIONS do
-  -- XXX: `tostring` is FFUNC.
-  tostring(i)
-end
-
-res, err = misc.sysprof.stop()
-assert(res, err)
-
-test:ok(true, 'FFUNC frames were streamed correctly')
-
-os.exit(test:check() and 0 or 1)