From fc1fb32851e400a5f84d60c71716f7c39371d0c2 Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Mon, 21 Oct 2024 15:22:07 +0300 Subject: [PATCH] spawn-server-nofork: invalid magic (#18831) * debug code to trace the issue * detect the libsystemd socket and keep it open when forking * fix the path * evaluate 2k fds * increase to 1k pad buffer * detect it in a different way * cleanup * use the same journal direct socket detection * removed debugging paddding --- src/libnetdata/log/nd_log-init.c | 28 +++---- src/libnetdata/log/nd_log-internals.c | 2 + src/libnetdata/log/nd_log-internals.h | 4 +- .../log/nd_log-to-systemd-journal.c | 78 ++++++++++++------- src/libnetdata/log/nd_log.h | 1 + .../spawn_server/spawn_server_nofork.c | 75 +++++++++++++----- 6 files changed, 122 insertions(+), 66 deletions(-) diff --git a/src/libnetdata/log/nd_log-init.c b/src/libnetdata/log/nd_log-init.c index 4ce42547bb9450..7f846b136386a3 100644 --- a/src/libnetdata/log/nd_log-init.c +++ b/src/libnetdata/log/nd_log-init.c @@ -44,7 +44,7 @@ void nd_log_initialize_for_external_plugins(const char *name) { program_name = name; for(size_t i = 0; i < _NDLS_MAX ;i++) { - nd_log.sources[i].method = STDERR_FILENO; + nd_log.sources[i].method = NDLM_DEFAULT; nd_log.sources[i].fd = -1; nd_log.sources[i].fp = NULL; } @@ -123,12 +123,10 @@ void nd_log_initialize_for_external_plugins(const char *name) { break; } - for(size_t i = 0; i < _NDLS_MAX ;i++) { - nd_log.sources[i].method = method; - nd_log.sources[i].format = format; - nd_log.sources[i].fd = -1; - nd_log.sources[i].fp = NULL; - } + nd_log.sources[NDLS_COLLECTORS].method = method; + nd_log.sources[NDLS_COLLECTORS].format = format; + nd_log.sources[NDLS_COLLECTORS].fd = -1; + nd_log.sources[NDLS_COLLECTORS].fp = NULL; // nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "FINAL_LOG_METHOD: %s", nd_log_id2method(method)); } @@ -271,6 +269,10 @@ void nd_log_reopen_log_files(bool log) { netdata_log_info("Log files re-opened."); } +int nd_log_systemd_journal_fd(void) { + return nd_log.journal.fd; +} + void nd_log_reopen_log_files_for_spawn_server(const char *name) { gettid_uncached(); @@ -284,13 +286,11 @@ void nd_log_reopen_log_files_for_spawn_server(const char *name) { close(nd_log.journal_direct.fd); nd_log.journal_direct.fd = -1; nd_log.journal_direct.initialized = false; - nd_log_journal_direct_init(NULL); } for(size_t i = 0; i < _NDLS_MAX ;i++) { - if(i != NDLS_COLLECTORS && i != NDLS_DAEMON) continue; - spinlock_init(&nd_log.sources[i].spinlock); + nd_log.sources[i].method = NDLM_DEFAULT; nd_log.sources[i].fd = -1; nd_log.sources[i].fp = NULL; nd_log.sources[i].pending_msg = NULL; @@ -299,16 +299,10 @@ void nd_log_reopen_log_files_for_spawn_server(const char *name) { #endif } - for(size_t i = 0; i < _NDLS_MAX ;i++) { - if(i == NDLS_COLLECTORS || i == NDLS_DAEMON) continue; - nd_log.sources[i].method = NDLM_DISABLED; - } - + // initialize spinlocks spinlock_init(&nd_log.std_output.spinlock); spinlock_init(&nd_log.std_error.spinlock); - nd_log.journal.initialized = false; - nd_log.journal_direct.initialized = false; nd_log.syslog.initialized = false; nd_log.eventlog.initialized = false; nd_log.std_output.initialized = false; diff --git a/src/libnetdata/log/nd_log-internals.c b/src/libnetdata/log/nd_log-internals.c index cb26b816e9f149..97f521fad999b8 100644 --- a/src/libnetdata/log/nd_log-internals.c +++ b/src/libnetdata/log/nd_log-internals.c @@ -284,6 +284,8 @@ struct nd_log nd_log = { .overwrite_process_source = 0, .journal = { .initialized = false, + .first_msg = false, + .fd = -1, }, .journal_direct = { .initialized = false, diff --git a/src/libnetdata/log/nd_log-internals.h b/src/libnetdata/log/nd_log-internals.h index 03ac40405f69a3..1a761401bfdf8d 100644 --- a/src/libnetdata/log/nd_log-internals.h +++ b/src/libnetdata/log/nd_log-internals.h @@ -133,12 +133,14 @@ struct nd_log { struct { bool initialized; + bool first_msg; + int fd; // we don't control this, we just detect it to keep it open } journal; struct { bool initialized; int fd; - char filename[FILENAME_MAX + 1]; + char filename[FILENAME_MAX]; } journal_direct; struct { diff --git a/src/libnetdata/log/nd_log-to-systemd-journal.c b/src/libnetdata/log/nd_log-to-systemd-journal.c index b574e693cfdd64..7489e13cdd9e6e 100644 --- a/src/libnetdata/log/nd_log-to-systemd-journal.c +++ b/src/libnetdata/log/nd_log-to-systemd-journal.c @@ -12,18 +12,36 @@ bool nd_log_journal_systemd_init(void) { return nd_log.journal.initialized; } -bool nd_log_journal_socket_available(void) { - if(netdata_configured_host_prefix && *netdata_configured_host_prefix) { - char filename[FILENAME_MAX + 1]; +static int nd_log_journal_direct_fd_find_and_open(char *filename, size_t size) { + int fd; - snprintfz(filename, sizeof(filename), "%s%s", - netdata_configured_host_prefix, "/run/systemd/journal/socket"); + if(netdata_configured_host_prefix && *netdata_configured_host_prefix) { + journal_construct_path(filename, size, netdata_configured_host_prefix, "netdata"); + if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1) + return fd; - if(is_path_unix_socket(filename)) - return true; + journal_construct_path(filename, size, netdata_configured_host_prefix, NULL); + if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1) + return fd; } - return is_path_unix_socket("/run/systemd/journal/socket"); + journal_construct_path(filename, size, NULL, "netdata"); + if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1) + return fd; + + journal_construct_path(filename, size, NULL, NULL); + if (is_path_unix_socket(filename) && (fd = journal_direct_fd(filename)) != -1) + return fd; + + return -1; +} + +bool nd_log_journal_socket_available(void) { + char filename[FILENAME_MAX]; + int fd = nd_log_journal_direct_fd_find_and_open(filename, sizeof(filename)); + if(fd == -1) return false; + close(fd); + return true; } static void nd_log_journal_direct_set_env(void) { @@ -38,25 +56,9 @@ bool nd_log_journal_direct_init(const char *path) { } int fd; - char filename[FILENAME_MAX + 1]; - if(!is_path_unix_socket(path)) { - - journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, "netdata"); - if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) { - - journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, NULL); - if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) { - - journal_construct_path(filename, sizeof(filename), NULL, "netdata"); - if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) { - - journal_construct_path(filename, sizeof(filename), NULL, NULL); - if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) - return false; - } - } - } - } + char filename[FILENAME_MAX]; + if(!is_path_unix_socket(path)) + fd = nd_log_journal_direct_fd_find_and_open(filename, sizeof(filename)); else { snprintfz(filename, sizeof(filename), "%s", path); fd = journal_direct_fd(filename); @@ -74,6 +76,8 @@ bool nd_log_journal_direct_init(const char *path) { return true; } +static bool sockets_before[1024]; + bool nd_logger_journal_libsystemd(struct log_field *fields __maybe_unused, size_t fields_max __maybe_unused) { #ifdef HAVE_SYSTEMD @@ -154,8 +158,28 @@ bool nd_logger_journal_libsystemd(struct log_field *fields __maybe_unused, size_ } } + bool detect_systemd_socket = __atomic_load_n(&nd_log.journal.first_msg, __ATOMIC_RELAXED) == false; + if(detect_systemd_socket) { + for(int i = 3 ; (size_t)i < _countof(sockets_before); i++) + sockets_before[i] = fd_is_socket(i); + } + int r = sd_journal_sendv(iov, iov_count); + if(r == 0 && detect_systemd_socket) { + __atomic_store_n(&nd_log.journal.first_msg, true, __ATOMIC_RELAXED); + + // this is the first successful libsystemd log + // let's detect its fd number (we need it for the spawn server) + + for(int i = 3 ; (size_t)i < _countof(sockets_before); i++) { + if (!sockets_before[i] && fd_is_socket(i)) { + nd_log.journal.fd = i; + break; + } + } + } + // Clean up allocated memory for (int i = 0; i < iov_count; i++) { if (iov[i].iov_base != NULL) { diff --git a/src/libnetdata/log/nd_log.h b/src/libnetdata/log/nd_log.h index 8f501c7e40bac5..1fefbe32831e15 100644 --- a/src/libnetdata/log/nd_log.h +++ b/src/libnetdata/log/nd_log.h @@ -14,6 +14,7 @@ extern "C" { #define ND_LOG_DEFAULT_THROTTLE_PERIOD 60 void errno_clear(void); +int nd_log_systemd_journal_fd(void); void nd_log_set_user_settings(ND_LOG_SOURCES source, const char *setting); void nd_log_set_facility(const char *facility); void nd_log_set_priority_level(const char *setting); diff --git a/src/libnetdata/spawn_server/spawn_server_nofork.c b/src/libnetdata/spawn_server/spawn_server_nofork.c index ede39d0579bf22..9986740debd21c 100644 --- a/src/libnetdata/spawn_server/spawn_server_nofork.c +++ b/src/libnetdata/spawn_server/spawn_server_nofork.c @@ -274,7 +274,13 @@ static bool spawn_external_command(SPAWN_SERVER *server __maybe_unused, SPAWN_RE return false; } - os_close_all_non_std_open_fds_except(rq->fds, 3, CLOSE_RANGE_CLOEXEC); + int fds_to_keep[] = { + rq->fds[0], + rq->fds[1], + rq->fds[2], + nd_log_systemd_journal_fd(), + }; + os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), CLOSE_RANGE_CLOEXEC); errno_clear(); if (posix_spawn(&rq->pid, rq->argv[0], &file_actions, &attr, (char * const *)rq->argv, (char * const *)rq->envp) != 0) { @@ -325,7 +331,14 @@ static bool spawn_server_run_callback(SPAWN_SERVER *server __maybe_unused, SPAWN os_setproctitle("spawn-callback", server->argc, server->argv); // close all open file descriptors of the parent, but keep ours - os_close_all_non_std_open_fds_except(rq->fds, 4, 0); + int fds_to_keep[] = { + rq->fds[0], + rq->fds[1], + rq->fds[2], + rq->fds[3], + nd_log_systemd_journal_fd(), + }; + os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), 0); nd_log_reopen_log_files_for_spawn_server("spawn-callback"); // get the fds from the request @@ -1076,7 +1089,12 @@ SPAWN_SERVER* spawn_server_create(SPAWN_SERVER_OPTIONS options, const char *name os_setproctitle(buf, server->argc, server->argv); replace_stdio_with_dev_null(); - os_close_all_non_std_open_fds_except((int[]){ server->sock, server->pipe[1] }, 2, 0); + int fds_to_keep[] = { + server->sock, + server->pipe[1], + nd_log_systemd_journal_fd(), + }; + os_close_all_non_std_open_fds_except(fds_to_keep, _countof(fds_to_keep), 0); nd_log_reopen_log_files_for_spawn_server(buf); exit(spawn_server_event_loop(server)); } @@ -1125,6 +1143,21 @@ void spawn_server_exec_destroy(SPAWN_INSTANCE *instance) { freez(instance); } +static void log_invalid_magic(SPAWN_INSTANCE *instance, struct status_report *sr) { + unsigned char buf[sizeof(*sr) + 1]; + memcpy(buf, sr, sizeof(*sr)); + buf[sizeof(buf) - 1] = '\0'; + + for(size_t i = 0; i < sizeof(buf) - 1; i++) { + if (iscntrl(buf[i]) || !isprint(buf[i])) + buf[i] = '_'; + } + + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: invalid final status report for child %d, request %zu (invalid magic %#x in response, reads like '%s')", + instance->child_pid, instance->request_id, sr->magic, buf); +} + int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE *instance) { int rc = -1; @@ -1139,24 +1172,24 @@ int spawn_server_exec_wait(SPAWN_SERVER *server __maybe_unused, SPAWN_INSTANCE * "SPAWN PARENT: failed to read final status report for child %d, request %zu", instance->child_pid, instance->request_id); - else if(sr.magic != STATUS_REPORT_MAGIC) { - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN PARENT: invalid final status report for child %d, request %zu (invalid magic %#x in response)", - instance->child_pid, instance->request_id, sr.magic); - } - else switch(sr.status) { - case STATUS_REPORT_EXITED: - rc = sr.exited.waitpid_status; - break; - - case STATUS_REPORT_STARTED: - case STATUS_REPORT_FAILED: - default: - errno = 0; - nd_log(NDLS_COLLECTORS, NDLP_ERR, - "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)", - instance->request_id, instance->child_pid, sr.status); - break; + else if(sr.magic != STATUS_REPORT_MAGIC) + log_invalid_magic(instance, &sr); + else { + switch (sr.status) { + case STATUS_REPORT_EXITED: + rc = sr.exited.waitpid_status; + break; + + case STATUS_REPORT_STARTED: + case STATUS_REPORT_FAILED: + default: + errno = 0; + nd_log( + NDLS_COLLECTORS, NDLP_ERR, + "SPAWN PARENT: invalid status report to exec spawn request %zu for pid %d (status = %u)", + instance->request_id, instance->child_pid, sr.status); + break; + } } instance->child_pid = 0;