Skip to content

Commit

Permalink
TDB-184 : Broken fsync error handling
Browse files Browse the repository at this point in the history
- Factored out i/o error reporting to be common and try to obtain actual
  filename from descriptor.
- Fixed ENOSPC reporting to use common i/o reporting.
- Fixed fsync retry logic to retry EINTR, EIO, and ENOLCK based on InnoDB code
  and upstream mysql issues 34823 and 90296. Retry will print message every 100
  attempts in a loop.
- Touched up 'slow fsync' tracking and reporting to use new i/o error reporting.
- Fixed close retry logic to be same as fsync.
  • Loading branch information
George O. Lorch III committed Apr 6, 2018
1 parent f02e0f4 commit 4d7237c
Showing 1 changed file with 162 additions and 84 deletions.
246 changes: 162 additions & 84 deletions portability/file.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,67 +72,98 @@ void toku_fs_get_write_info(time_t *enospc_last_time, uint64_t *enospc_current,
*enospc_total = toku_write_enospc_total;
}


void report_io_problem(int fd, const char *fmt, ...) __attribute__((format(printf, 2, 3)));
void report_io_problem(int fd, const char *fmt, ...) {
va_list ap;
va_start(ap, fmt);

const int tstr_length = 26;
char tstr[tstr_length];
time_t t = time(0);
ctime_r(&t, tstr);

char buf[4096];

vsnprintf(buf, sizeof(buf), fmt, ap);

#if __linux__
char fdname[256];
snprintf(fdname, sizeof fdname, "/proc/%d/fd/%d", getpid(), fd);
char lname[256];
ssize_t s = readlink(fdname, lname, sizeof lname);
if (0 < s && s < (ssize_t)sizeof lname)
lname[s] = 0;

fprintf(stderr,
"%.24s : PerconaFT I/O problem, fd[%d], file[%s] : %s\n",
tstr,
fd,
s > 0 ? lname : "?",
buf);
#else
fprintf(stderr,
"%.24s : PerconaFT I/O problem, fd[%d], file[?] : %s\n",
tstr,
fd,
buf);
#endif
fflush(stderr);
}


//Print any necessary errors
//Return whether we should try the write again.
static void
try_again_after_handling_write_error(int fd, size_t len, ssize_t r_write) {
static void try_again_after_handling_write_error(int fd,
size_t len,
ssize_t r_write) {
int try_again = 0;

assert(r_write < 0);
int errno_write = get_error_errno();
switch (errno_write) {
case EINTR: { //The call was interrupted by a signal before any data was written; see signal(7).
char err_msg[sizeof("Write of [] bytes to fd=[] interrupted. Retrying.") + 20+10]; //64 bit is 20 chars, 32 bit is 10 chars
snprintf(err_msg, sizeof(err_msg), "Write of [%" PRIu64 "] bytes to fd=[%d] interrupted. Retrying.", (uint64_t)len, fd);
perror(err_msg);
fflush(stderr);
try_again = 1;
break;
}
case ENOSPC: {
if (toku_assert_on_write_enospc) {
char err_msg[sizeof("Failed write of [] bytes to fd=[].") + 20+10]; //64 bit is 20 chars, 32 bit is 10 chars
snprintf(err_msg, sizeof(err_msg), "Failed write of [%" PRIu64 "] bytes to fd=[%d].", (uint64_t)len, fd);
perror(err_msg);
fflush(stderr);
int out_of_disk_space = 1;
assert(!out_of_disk_space); //Give an error message that might be useful if this is the only one that survives.
} else {
toku_sync_fetch_and_add(&toku_write_enospc_total, 1);
toku_sync_fetch_and_add(&toku_write_enospc_current, 1);

time_t tnow = time(0);
toku_write_enospc_last_time = tnow;
if (toku_write_enospc_last_report == 0 || tnow - toku_write_enospc_last_report >= 60) {
toku_write_enospc_last_report = tnow;

const int tstr_length = 26;
char tstr[tstr_length];
time_t t = time(0);
ctime_r(&t, tstr);

const int MY_MAX_PATH = 256;
char fname[MY_MAX_PATH], symname[MY_MAX_PATH+1];
sprintf(fname, "/proc/%d/fd/%d", getpid(), fd);
ssize_t n = readlink(fname, symname, MY_MAX_PATH);

if ((int)n == -1)
fprintf(stderr, "%.24s PerconaFT No space when writing %" PRIu64 " bytes to fd=%d ", tstr, (uint64_t) len, fd);
else {
tstr[n] = 0; // readlink doesn't append a NUL to the end of the buffer.
fprintf(stderr, "%.24s PerconaFT No space when writing %" PRIu64 " bytes to %*s ", tstr, (uint64_t) len, (int) n, symname);
}
fprintf(stderr, "retry in %d second%s\n", toku_write_enospc_sleep, toku_write_enospc_sleep > 1 ? "s" : "");
fflush(stderr);
}
sleep(toku_write_enospc_sleep);
case EINTR: { // The call was interrupted by a signal before any data
// was written; see signal(7).
report_io_problem(fd,
"Write of [%" PRIu64
"] bytes interrupted. Retrying.",
(uint64_t)len);
try_again = 1;
toku_sync_fetch_and_sub(&toku_write_enospc_current, 1);
break;
}
}
default:
break;
case ENOSPC: {
if (toku_assert_on_write_enospc) {
report_io_problem(
fd, "Failed write of [%" PRIu64 "] bytes.", (uint64_t)len);
int out_of_disk_space = 1;
assert(!out_of_disk_space); // Give an error message that might
// be useful if this is the only one
// that survives.
} else {
toku_sync_fetch_and_add(&toku_write_enospc_total, 1);
toku_sync_fetch_and_add(&toku_write_enospc_current, 1);

time_t tnow = time(0);
toku_write_enospc_last_time = tnow;
if (toku_write_enospc_last_report == 0 ||
tnow - toku_write_enospc_last_report >= 60) {
toku_write_enospc_last_report = tnow;

report_io_problem(fd,
"No space when writing %" PRIu64
" bytes. Retry in %d second%s.",
(uint64_t)len,
toku_write_enospc_sleep,
toku_write_enospc_sleep > 1 ? "s" : "");
}
sleep(toku_write_enospc_sleep);
try_again = 1;
toku_sync_fetch_and_sub(&toku_write_enospc_current, 1);
break;
}
}
default:
break;
}
assert(try_again);
errno = errno_write;
Expand Down Expand Up @@ -597,20 +628,48 @@ int toku_os_close_with_source_location(
uint src_line) { // if EINTR, retry until success
/* register the file close */
int r = -1;
uint64_t retry_count = 0;
bool retry = false;

/* register a file descriptor close " */
toku_io_instrumentation io_annotation;
toku_instr_file_fd_close_begin(
io_annotation, toku_instr_file_op::file_close, fd, src_file, src_line);
while (r != 0) {

do {
r = close(fd);
if (r) {
int rr = errno;
if (rr != EINTR)
printf("rr=%d (%s)\n", rr, strerror(rr));
assert(rr == EINTR);
if (r == -1) {
int what_errno = get_error_errno();
switch (what_errno) {
case ENOLCK:
// fallthrough
case EINTR:
// fallthrough
case EIO:
if (retry_count % 100 == 0) {
report_io_problem(
fd,
"close encoundered errno %d on %" PRIu64 " tries.",
what_errno,
retry_count);
}
retry_count++;
retry = true;
// If we have a debug build, lets go ahead and assert to try
// and get a call stack and some context on where we are
// getting these unusual I/O errors
paranoid_invariant(0);
break;

default:
paranoid_invariant(what_errno == 0);
retry = false;
break;
}
} else {
retry = false;
}
}
} while (retry);

/* Regsiter the returning value with the system */
toku_instr_file_close_end(io_annotation, r);
Expand Down Expand Up @@ -685,7 +744,7 @@ static uint64_t toku_fsync_time;
static uint64_t toku_long_fsync_threshold = 1000000;
static uint64_t toku_long_fsync_count;
static uint64_t toku_long_fsync_time;
static uint64_t toku_long_fsync_eintr_count;
static uint64_t toku_long_fsync_retry_count;
static int toku_fsync_debug = 0;

void toku_set_func_fsync(int (*fsync_function)(int)) {
Expand All @@ -698,7 +757,8 @@ void file_fsync_internal_with_source_location(int fd,
uint src_line) {
uint64_t tstart = toku_current_time_microsec();
int r = -1;
uint64_t eintr_count = 0;
uint64_t retry_count = 0;
bool retry = false;

toku_io_instrumentation io_annotation;
toku_instr_file_io_begin(io_annotation,
Expand All @@ -708,17 +768,46 @@ void file_fsync_internal_with_source_location(int fd,
src_file,
src_line);

while (r != 0) {
do {
if (t_fsync) {
r = t_fsync(fd);
} else {
r = fsync(fd);
r = fsync(fd);
}
if (r) {
assert(get_error_errno() == EINTR);
eintr_count++;
}
}

if (r == -1) {
int what_errno = get_error_errno();
switch (what_errno) {
case ENOLCK:
// fallthrough
case EINTR:
// fallthrough
case EIO:
if (retry_count % 100 == 0) {
report_io_problem(
fd,
"fsync encoundered errno %d on %" PRIu64 " tries.",
what_errno,
retry_count);
}
retry_count++;
retry = true;
// If we have a debug build, lets go ahead and assert to try
// and get a call stack and some context on where we are
// getting these unusual I/O errors
paranoid_invariant(0);
break;

default:
paranoid_invariant(what_errno == 0);
retry = false;
break;
}
} else {
retry = false;
}
} while (retry);

toku_sync_fetch_and_add(&toku_fsync_count, 1);
uint64_t duration = toku_current_time_microsec() - tstart;
toku_sync_fetch_and_add(&toku_fsync_time, duration);
Expand All @@ -728,24 +817,13 @@ void file_fsync_internal_with_source_location(int fd,
if (duration >= toku_long_fsync_threshold) {
toku_sync_fetch_and_add(&toku_long_fsync_count, 1);
toku_sync_fetch_and_add(&toku_long_fsync_time, duration);
toku_sync_fetch_and_add(&toku_long_fsync_eintr_count, eintr_count);
toku_sync_fetch_and_add(&toku_long_fsync_retry_count, retry_count);
if (toku_fsync_debug) {
const int tstr_length = 26;
char tstr[tstr_length];
time_t t = time(0);
#if __linux__
char fdname[256];
snprintf(fdname, sizeof fdname, "/proc/%d/fd/%d", getpid(), fd);
char lname[256];
ssize_t s = readlink(fdname, lname, sizeof lname);
if (0 < s && s < (ssize_t) sizeof lname)
lname[s] = 0;
fprintf(stderr, "%.24s toku_file_fsync %s fd=%d %s duration=%" PRIu64 " usec eintr=%" PRIu64 "\n",
ctime_r(&t, tstr), __FUNCTION__, fd, s > 0 ? lname : "?", duration, eintr_count);
#else
fprintf(stderr, "%.24s toku_file_fsync %s fd=%d duration=%" PRIu64 " usec eintr=%" PRIu64 "\n",
ctime_r(&t, tstr), __FUNCTION__, fd, duration, eintr_count);
#endif
report_io_problem(fd,
"toku_file_fsync took %" PRIu64 " usec with %"
PRIu64 " retries.",
duration,
retry_count);
fflush(stderr);
}
}
Expand Down

0 comments on commit 4d7237c

Please sign in to comment.