Skip to content

Commit

Permalink
runner: add one timer for each libtcmu cmd
Browse files Browse the repository at this point in the history
Currently we have hit the problem that tcmu-runner couldn't get any
response from the gluster backend for many times in the product, and
there is no any explict evidence from the logs.

This patch will add one timer for each cmd and will record all the
pending cmds for each tcmu device in 5/10/15/20/25/30+ seconds.

Signed-off-by: Xiubo Li <[email protected]>
  • Loading branch information
lxbsz committed Feb 12, 2019
1 parent 2c877c2 commit cf50d18
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 9 deletions.
18 changes: 18 additions & 0 deletions libtcmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#define _GNU_SOURCE
#include <memory.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
Expand Down Expand Up @@ -530,6 +531,9 @@ static int add_device(struct tcmulib_context *ctx, char *dev_name,
}

darray_append(ctx->devices, dev);
pthread_mutex_lock(&pending_cmds_lock);
list_add_tail(&pending_cmds_head, &dev->entry);
pthread_mutex_unlock(&pending_cmds_lock);

if (reopen && reset_supp)
tcmu_cfgfs_dev_exec_action(dev, "block_dev", 0);
Expand Down Expand Up @@ -583,6 +587,9 @@ static void remove_device(struct tcmulib_context *ctx, char *dev_name,
}

darray_remove(ctx->devices, i);
pthread_mutex_lock(&pending_cmds_lock);
list_del_init(&dev->entry);
pthread_mutex_unlock(&pending_cmds_lock);

dev->handler->removed(dev);

Expand Down Expand Up @@ -959,6 +966,9 @@ struct tcmulib_cmd *tcmulib_get_next_command(struct tcmu_device *dev)
if (!cmd)
return NULL;
cmd->cmd_id = ent->hdr.cmd_id;
cmd->dev = dev;
cmd->timer = NULL;
cmd->timeout = 0;

/* Convert iovec addrs in-place to not be offsets */
cmd->iov_cnt = ent->req.iov_cnt;
Expand Down Expand Up @@ -1119,6 +1129,11 @@ void tcmulib_command_complete(
struct tcmu_mailbox *mb = dev->map;
struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail;

if (cmd->timeout) {
dev->timeout_cmds[cmd->timeout / CMD_TO_STEP - 1]--;
pthread_cond_signal(&pending_cmds_cond);
}

/* current command could be PAD in async case */
while (ent != (void *) mb + mb->cmdr_off + mb->cmd_head) {
if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD)
Expand All @@ -1139,6 +1154,9 @@ void tcmulib_command_complete(
}

TCMU_UPDATE_RB_TAIL(mb, ent);

if (cmd->timer)
free(cmd->timer);
free(cmd);
}

Expand Down
4 changes: 4 additions & 0 deletions libtcmu_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ struct tcmulib_cmd {

/* callback to finish/continue command processing */
cmd_done_t done;

struct tcmu_device *dev;
struct tcmu_timer *timer;
uint16_t timeout;
};

/* Set/Get methods for the opaque tcmu_device */
Expand Down
48 changes: 42 additions & 6 deletions libtcmu_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ log_internal(int pri, struct tcmu_device *dev, const char *funcname,
int linenr, const char *fmt, va_list args)
{
char buf[LOG_MSG_LEN];
int n;
int n = 0;
struct tcmulib_handler *handler;

if (pri > tcmu_log_level)
Expand All @@ -236,13 +236,14 @@ log_internal(int pri, struct tcmu_device *dev, const char *funcname,
}

/* Format the log msg */
if (funcname)
n = sprintf(buf, "%s:%d: ", funcname, linenr);

if (dev) {
handler = tcmu_dev_get_handler(dev);
n = sprintf(buf, "%s:%d %s/%s: ", funcname, linenr,
handler ? handler->subtype: "",
dev ? dev->tcm_dev_name: "");
} else {
n = sprintf(buf, "%s:%d: ", funcname, linenr);
n += sprintf(buf + n, "%s/%s: ",
handler ? handler->subtype: "",
dev ? dev->tcm_dev_name: "");
}

vsnprintf(buf + n, LOG_MSG_LEN - n, fmt, args);
Expand Down Expand Up @@ -514,6 +515,34 @@ static bool log_dequeue_msg(struct log_buf *logbuf)
return true;
}

pthread_cond_t pending_cmds_cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t pending_cmds_lock = PTHREAD_MUTEX_INITIALIZER;
struct list_head pending_cmds_head = LIST_HEAD_INIT(pending_cmds_head);
static pthread_t pending_thread_id;
static void *log_thread_pending_start(void *arg)
{
struct tcmu_device *dev, *tmp;

while (1) {
pthread_mutex_lock(&pending_cmds_lock);
pthread_cond_wait(&pending_cmds_cond, &pending_cmds_lock);
list_for_each_safe(&pending_cmds_head, dev, tmp, entry) {
tcmu_dev_warn_clean(dev, "Pending cmds: 30(+)s:[%lu],"
" 25s:[%lu], 20s:[%lu], 15s:[%lu], 10s:[%lu],"
" 5s:[%lu]\n",
dev->timeout_cmds[CMD_TO_30SEC / CMD_TO_STEP - 1],
dev->timeout_cmds[CMD_TO_25SEC / CMD_TO_STEP - 1],
dev->timeout_cmds[CMD_TO_20SEC / CMD_TO_STEP - 1],
dev->timeout_cmds[CMD_TO_15SEC / CMD_TO_STEP - 1],
dev->timeout_cmds[CMD_TO_10SEC / CMD_TO_STEP - 1],
dev->timeout_cmds[CMD_TO_5SEC / CMD_TO_STEP - 1]);
}
pthread_mutex_unlock(&pending_cmds_lock);
}

return NULL;
}

static void *log_thread_start(void *arg)
{
tcmu_logbuf = arg;
Expand Down Expand Up @@ -691,6 +720,13 @@ int tcmu_setup_log(char *log_dir)
return ret;
}

ret = pthread_create(&pending_thread_id, NULL, log_thread_pending_start,
NULL);
if (ret) {
pthread_cancel(logbuf->thread_id);
return ret;
}

return 0;

free_log_dir:
Expand Down
4 changes: 4 additions & 0 deletions libtcmu_log.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#define TCMU_LOG_DIR_DEFAULT "/var/log"

struct tcmu_device;
extern struct list_head pending_cmds_head;
extern pthread_cond_t pending_cmds_cond;
extern pthread_mutex_t pending_cmds_lock;

void tcmu_set_log_level(int level);
unsigned int tcmu_get_log_level(void);
Expand All @@ -49,6 +52,7 @@ void tcmu_dbg_scsi_cmd_message(struct tcmu_device *dev, const char *funcname, in
#define tcmu_dev_crit(dev, ...) do { tcmu_crit_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
#define tcmu_dev_err(dev, ...) do { tcmu_err_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
#define tcmu_dev_warn(dev, ...) do { tcmu_warn_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
#define tcmu_dev_warn_clean(dev, ...) do { tcmu_warn_message(dev, NULL, 0, __VA_ARGS__);} while (0)
#define tcmu_dev_info(dev, ...) do { tcmu_info_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
#define tcmu_dev_dbg(dev, ...) do { tcmu_dbg_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
#define tcmu_dev_dbg_scsi_cmd(dev, ...) do { tcmu_dbg_scsi_cmd_message(dev, __func__, __LINE__, __VA_ARGS__);} while (0)
Expand Down
13 changes: 13 additions & 0 deletions libtcmu_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <pthread.h>

#include "darray.h"
#include "ccan/list/list.h"

#define KERN_IFACE_VER 2

Expand All @@ -35,6 +36,15 @@ struct tcmulib_context {
GDBusConnection *connection;
};

#define CMD_TO_5SEC 5
#define CMD_TO_10SEC 10
#define CMD_TO_15SEC 15
#define CMD_TO_20SEC 20
#define CMD_TO_25SEC 25
#define CMD_TO_30SEC 30
#define CMD_TO_STEP 5
#define CMD_TO_COUNT 6

struct tcmu_device {
int fd;

Expand Down Expand Up @@ -63,6 +73,9 @@ struct tcmu_device {
struct tcmulib_handler *handler;
struct tcmulib_context *ctx;

uint64_t timeout_cmds[CMD_TO_COUNT];
struct list_node entry;

void *hm_private; /* private ptr for handler module */
};

Expand Down
9 changes: 9 additions & 0 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "version.h"
#include "libtcmu_config.h"
#include "libtcmu_log.h"
#include "libtcmu_timer.h"

# define TCMU_LOCK_FILE "/var/run/lock/tcmu.lock"

Expand Down Expand Up @@ -1022,6 +1023,12 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}

ret = tcmu_init_timer_base();
if (ret) {
tcmu_err("failed to init tcmu timer base!\n");
exit(1);
}

while (1) {
int option_index = 0;
int c, nr_files;
Expand Down Expand Up @@ -1234,6 +1241,8 @@ int main(int argc, char **argv)
if (new_path)
free(handler_path);

tcmu_cleanup_timer_base();

if (ret)
exit(EXIT_FAILURE);

Expand Down
48 changes: 45 additions & 3 deletions tcmur_cmd_handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "libtcmu_log.h"
#include "libtcmu_priv.h"
#include "libtcmu_common.h"
#include "libtcmu_timer.h"
#include "tcmur_aio.h"
#include "tcmur_device.h"
#include "tcmu-runner.h"
Expand All @@ -39,10 +40,11 @@ void tcmur_command_complete(struct tcmu_device *dev, struct tcmulib_cmd *cmd,
struct tcmur_device *rdev = tcmu_dev_get_private(dev);

pthread_cleanup_push(_cleanup_spin_lock, (void *)&rdev->lock);
pthread_spin_lock(&rdev->lock);
if (cmd->timer)
tcmu_del_timer(cmd->timer);

pthread_spin_lock(&rdev->lock);
tcmulib_command_complete(dev, cmd, rc);

pthread_spin_unlock(&rdev->lock);
pthread_cleanup_pop(0);
}
Expand All @@ -61,6 +63,29 @@ static void aio_command_finish(struct tcmu_device *dev, struct tcmulib_cmd *cmd,
}
}

void tcmur_cmd_timeout(struct tcmu_timer *timer, void *data)
{
struct tcmulib_cmd *cmd = (struct tcmulib_cmd *)data;
struct tcmu_device *dev = cmd->dev;
struct tcmur_device *rdev = tcmu_dev_get_private(dev);

if (cmd->timeout >= CMD_TIMEOUT_30SEC)
return;

pthread_spin_lock(&rdev->lock);
if (!cmd->timeout) {
cmd->timeout = CMD_TIMEOUT_5SEC;
} else {
dev->timeout_cmds[cmd->timeout / CMD_TIMEOUT_STEP - 1]--;
cmd->timeout += CMD_TIMEOUT_STEP;
}
dev->timeout_cmds[cmd->timeout / CMD_TIMEOUT_STEP - 1]++;
pthread_spin_unlock(&rdev->lock);

tcmu_mod_timer(timer, CMD_TIMEOUT_STEP);
pthread_cond_signal(&pending_cmds_cond);
}

static int alloc_iovec(struct tcmulib_cmd *cmd, size_t length)
{
struct iovec *iov;
Expand Down Expand Up @@ -2239,6 +2264,8 @@ static int tcmur_cmd_handler(struct tcmu_device *dev, struct tcmulib_cmd *cmd)
struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
struct tcmur_device *rdev = tcmu_dev_get_private(dev);
uint8_t *cdb = cmd->cdb;
struct tcmu_timer *timer;


track_aio_request_start(rdev);

Expand Down Expand Up @@ -2275,6 +2302,18 @@ static int tcmur_cmd_handler(struct tcmu_device *dev, struct tcmulib_cmd *cmd)
break;
}

timer = malloc(sizeof(*timer));
if (!timer) {
tcmu_dev_err(dev, "malloc timer failed!\n");
goto untrack;
}

timer->data = cmd;
timer->expires = CMD_TIMEOUT_5SEC;
timer->function = tcmur_cmd_timeout;
cmd->timer = timer;
tcmu_add_timer(timer);

switch(cdb[0]) {
case READ_6:
case READ_10:
Expand Down Expand Up @@ -2318,8 +2357,11 @@ static int tcmur_cmd_handler(struct tcmu_device *dev, struct tcmulib_cmd *cmd)
}

untrack:
if (ret != TCMU_STS_ASYNC_HANDLED)
if (ret != TCMU_STS_ASYNC_HANDLED) {
if (timer)
tcmu_del_timer(timer);
track_aio_request_finish(rdev, NULL);
}
return ret;
}

Expand Down
1 change: 1 addition & 0 deletions tcmur_cmd_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@ typedef int (*tcmur_caw_fn_t)(struct tcmu_device *dev, struct tcmulib_cmd *cmd,
size_t iov_cnt);
int tcmur_handle_caw(struct tcmu_device *dev, struct tcmulib_cmd *cmd,
tcmur_caw_fn_t caw_fn);
void tcmur_cmd_timeout(struct tcmu_timer *timer, void *data);

#endif /* __TCMUR_CMD_HANDLER_H */

0 comments on commit cf50d18

Please sign in to comment.