Skip to content

Commit

Permalink
watchdog: improve the behavior how watchdog is handled
Browse files Browse the repository at this point in the history
* dlt_daemon_process_user_messages
* dlt_daemon_send_ringbuffer_to_client
* dlt_daemon_client_send_all_multiple
* dlt_daemon_process_systemd_timer
* dlt_daemon_handle_event
* dlt_daemon_socket_sendreliable

These methods can take longer than the systemd watchdog timeout allowed, depending on the number of clients, which leads that the systemd watchdog kills dlt-daemon.
Add a timeout to these methods so that they may only run for as long as the watchdog timer is set.
In addition, set the default WatchdogTimeout to an appropriate value in case the env variable is not set.

Signed-off-by: Ahmet Findikci <[email protected]>
  • Loading branch information
ahmetfindi committed Jan 5, 2025
1 parent 80371aa commit 1b4def1
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 21 deletions.
37 changes: 28 additions & 9 deletions src/daemon/dlt-daemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -1549,14 +1549,25 @@ int main(int argc, char *argv[])
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
{
char *watchdogUSec = getenv("WATCHDOG_USEC");
int watchdogTimeoutSeconds = 0;
// set a sensible default, in case the environment variable is not set
int watchdogTimeoutSeconds = 30;

dlt_log(LOG_DEBUG, "Systemd watchdog initialization\n");

if (watchdogUSec)
if (watchdogUSec) {
// WATCHDOG_USEC is the timeout in micrsoseconds
// divide this by 2*10^6 to get the interval in seconds
// 2 * because we notify systemd after half the timeout
watchdogTimeoutSeconds = atoi(watchdogUSec) / 2000000;
}

if (watchdogTimeoutSeconds == 0) {
dlt_log(LOG_WARNING, "Watchdog timeout is too small, need at least 1s, setting 1s timeout\n");
watchdogTimeoutSeconds = 1;
}

daemon.watchdog_trigger_interval = watchdogTimeoutSeconds;
daemon.watchdog_last_trigger_time = 0U;
create_timer_fd(&daemon_local,
watchdogTimeoutSeconds,
watchdogTimeoutSeconds,
Expand Down Expand Up @@ -3152,6 +3163,17 @@ int dlt_daemon_process_user_messages(DltDaemon *daemon,

/* look through buffer as long as data is in there */
while ((receiver->bytesRcvd >= min_size) && run_loop) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
/* this loop may be running long, so we have to exit it at some point to be able to
* to process other events, like feeding the watchdog
*/
bool watchdog_triggered= dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__);
run_loop = 0; // exit loop in next iteration
}
#endif

dlt_daemon_process_user_message_func func = NULL;

offset = 0;
Expand Down Expand Up @@ -3798,12 +3820,9 @@ int dlt_daemon_process_user_message_log(DltDaemon *daemon,

while (1) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
const unsigned int uptime = dlt_uptime();
if ((uptime - start_time) / 10000 > daemon->watchdog_trigger_interval) {
dlt_vlog(LOG_WARNING,
"spent already 1 watchdog trigger interval in %s, yielding to process other events.\n", __func__);
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_CRIT, "Could not reset systemd watchdog from %s\n", __func__);
bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s yields due to watchdog.\n", __func__);
break;
}
#endif
Expand Down Expand Up @@ -4162,7 +4181,7 @@ int dlt_daemon_send_ringbuffer_to_client(DltDaemon *daemon, DltDaemonLocal *daem

while ((length = dlt_buffer_copy(&(daemon->client_ringbuffer), data, sizeof(data))) > 0) {
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
dlt_daemon_trigger_systemd_watchdog_if_necessary(&curr_time, daemon->watchdog_trigger_interval);
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
#endif

if ((ret =
Expand Down
14 changes: 10 additions & 4 deletions src/daemon/dlt_daemon_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,
int verbose)
{
int sent = 0;
unsigned int i = 0;
nfds_t i = 0;
int ret = 0;
DltConnection *temp = NULL;
int type_mask =
Expand All @@ -125,6 +125,13 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,

for (i = 0; i < daemon_local->pEvent.nfds; i++)
{
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
bool watchdog_triggered = dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
if (watchdog_triggered) {
dlt_vlog(LOG_WARNING, "%s notified watchdog, processed %lu/%lu fds already.\n",
__func__, i, daemon_local->pEvent.nfds);
}
#endif
temp = dlt_event_handler_find_connection(&(daemon_local->pEvent),
daemon_local->pEvent.pfd[i].fd);

Expand Down Expand Up @@ -152,7 +159,7 @@ static int dlt_daemon_client_send_all_multiple(DltDaemon *daemon,
if (ret != DLT_DAEMON_ERROR_OK)
dlt_vlog(LOG_WARNING, "%s: send dlt message failed\n", __func__);
else
/* If sent to at least one client,
/* If sent to at least one client,
* then do not store in ring buffer
*/
sent = 1;
Expand Down Expand Up @@ -2398,8 +2405,7 @@ int dlt_daemon_process_systemd_timer(DltDaemon *daemon,
daemon->received_message_since_last_watchdog_interval = 0;
#endif

if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_log(LOG_CRIT, "Could not reset systemd watchdog\n");
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);

dlt_log(LOG_DEBUG, "Timer watchdog\n");

Expand Down
22 changes: 16 additions & 6 deletions src/daemon/dlt_daemon_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -2002,14 +2002,24 @@ void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState)
}

#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval) {
unsigned int uptime = dlt_uptime();
if ((uptime - *last_trigger_time) / 10000 >= watchdog_trigger_interval) {
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__);
*last_trigger_time = uptime;
bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon) {
if (daemon->watchdog_trigger_interval == 0) {
return false;
}

const unsigned int uptime_seconds = dlt_uptime() / 10000;
const unsigned int seconds_since_last_trigger = uptime_seconds - daemon->watchdog_last_trigger_time;
if (seconds_since_last_trigger < daemon->watchdog_trigger_interval) {
return false;
}
if (sd_notify(0, "WATCHDOG=1") < 0)
dlt_vlog(LOG_WARNING, "%s: Could not reset systemd watchdog\n", __func__);
else
daemon->watchdog_last_trigger_time = uptime_seconds;

return true;
}

#endif

#ifdef DLT_TRACE_LOAD_CTRL_ENABLE
Expand Down
8 changes: 7 additions & 1 deletion src/daemon/dlt_daemon_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ typedef struct
#endif
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
unsigned int watchdog_trigger_interval; /* watchdog trigger interval in [s] */
unsigned int watchdog_last_trigger_time; /* when the watchdog was last triggered in [s] */
#endif
#ifdef DLT_LOG_LEVEL_APP_CONFIG
DltDaemonContextLogSettings *app_id_log_level_settings; /**< Settings for app id specific log levels */
Expand Down Expand Up @@ -606,7 +607,12 @@ void dlt_daemon_control_reset_to_factory_default(DltDaemon *daemon,
void dlt_daemon_change_state(DltDaemon *daemon, DltDaemonState newState);

#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
void dlt_daemon_trigger_systemd_watchdog_if_necessary(unsigned int* last_trigger_time, unsigned int watchdog_trigger_interval);
/**
* Trigger the systemd watchdog when the timeout has been reached
* @param daemon pointer to dlt daemon structure
* @return true if the watchdog has been triggered
*/
bool dlt_daemon_trigger_systemd_watchdog_if_necessary(DltDaemon *daemon);
#endif

# ifdef __cplusplus
Expand Down
4 changes: 3 additions & 1 deletion src/daemon/dlt_daemon_event_handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ int dlt_daemon_handle_event(DltEventHandler *pEvent,
return -1;
}
#ifdef DLT_SYSTEMD_WATCHDOG_ENABLE
dlt_daemon_trigger_systemd_watchdog_if_necessary(&start_time, daemon->watchdog_trigger_interval);
// no need to yield here, it will be called in a loop anyways.
// therefore we also do not log.
dlt_daemon_trigger_systemd_watchdog_if_necessary(daemon);
#endif
}

Expand Down

0 comments on commit 1b4def1

Please sign in to comment.