From 331b954354e180e599ba75a858d15b57b8ee07ef Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Thu, 11 Jul 2013 18:08:05 -0400 Subject: [PATCH 1/9] clean up fadvise use on WritableFile. clean up foreground/background selection of operations in PosixMmapFile. --- include/leveldb/env.h | 19 ++++++++ table/table_builder.cc | 3 ++ util/env_posix.cc | 98 +++++++++++++++++++++++++++++++++--------- 3 files changed, 100 insertions(+), 20 deletions(-) diff --git a/include/leveldb/env.h b/include/leveldb/env.h index ba8374e5..fc21cff6 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -71,6 +71,7 @@ class Env { virtual Status NewWritableFile(const std::string& fname, WritableFile** result) = 0; + // Riak specific: // Derived from NewWritableFile. One change: if the file exists, // move to the end of the file and continue writing. // new file. On success, stores a pointer to the open file in @@ -81,6 +82,16 @@ class Env { virtual Status NewAppendableFile(const std::string& fname, WritableFile** result) = 0; + // Riak specific: + // Derived from NewWritableFile. Sets flag that changes + // code paths related to unmap/close and the use of + // background threads. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSstFile(const std::string& fname, + WritableFile** result) + {return(NewWritableFile(fname, result));}; + // Returns true iff the named file exists. virtual bool FileExists(const std::string& fname) = 0; @@ -233,6 +244,11 @@ class WritableFile { virtual Status Flush() = 0; virtual Status Sync() = 0; + // Riak specific: + // Provide hint where key/value data ends and metadata starts + // in an .sst table file. + virtual void SetMetadataOffset(uint64_t) {}; + private: // No copying allowed WritableFile(const WritableFile&); @@ -318,6 +334,9 @@ class EnvWrapper : public Env { Status NewAppendableFile(const std::string& f, WritableFile** r) { return target_->NewAppendableFile(f, r); } + Status NewSstFile(const std::string& f, WritableFile** r) { + return target_->NewSstFile(f, r); + } bool FileExists(const std::string& f) { return target_->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) { return target_->GetChildren(dir, r); diff --git a/table/table_builder.cc b/table/table_builder.cc index e84c6ab9..a5c3a529 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -228,6 +228,9 @@ Status TableBuilder::Finish() { BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle, sst_stats_block_handle; + // pass hint to Linux fadvise management + r->file->SetMetadataOffset(r->offset); + // Write filter block if (ok() && r->filter_block != NULL) { WriteRawBlock(r->filter_block->Finish(), kNoCompression, diff --git a/util/env_posix.cc b/util/env_posix.cc index d4803d08..c6e99acd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -60,9 +60,13 @@ struct BGCloseInfo size_t offset_; size_t length_; size_t unused_; + uint64_t metadata_; - BGCloseInfo(int fd, void * base, size_t offset, size_t length, size_t unused) - : fd_(fd), base_(base), offset_(offset), length_(length), unused_(unused) {}; + BGCloseInfo(int fd, void * base, size_t offset, size_t length, + size_t unused, uint64_t metadata) + : fd_(fd), base_(base), offset_(offset), length_(length), + unused_(unused), metadata_(metadata) + {}; }; class PosixSequentialFile: public SequentialFile { @@ -163,7 +167,7 @@ class PosixMmapReadableFile: public RandomAccessFile { } virtual ~PosixMmapReadableFile() { - BGCloseInfo * ptr=new BGCloseInfo(fd_, mmapped_region_, 0, length_, 0); + BGCloseInfo * ptr=new BGCloseInfo(fd_, mmapped_region_, 0, length_, 0, 0); Env::Default()->Schedule(&BGFileCloser, ptr, 4); }; @@ -195,9 +199,9 @@ class PosixMmapFile : public WritableFile { char* dst_; // Where to write next (in range [base_,limit_]) char* last_sync_; // Where have we synced up to uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? - bool pending_sync_; + uint64_t metadata_offset_; // Offset where sst metadata starts, or zero + bool pending_sync_; // Have we done an munmap of unsynced data? + bool is_sst_file_; // Is this an sst file be created in background? // Roundup x to a multiple of y static size_t Roundup(size_t x, size_t y) { @@ -218,19 +222,32 @@ class PosixMmapFile : public WritableFile { pending_sync_ = true; } - BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, limit_-dst_); - if (and_close) + BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, + limit_-dst_, metadata_offset_); + + // sst_file is on background compaction thread, keep all operations + // in sequence + if (is_sst_file_) { - // do this in foreground unfortunately, bug where file not - // closed fast enough for reopen - BGFileCloser2(ptr); - fd_=-1; + if (and_close) + BGFileCloser2(ptr); + else + BGFileUnmapper2(ptr); } // if + + // called from user thread, move these operations to background + // queue else { - Env::Default()->Schedule(&BGFileUnmapper2, ptr, 4); + if (and_close) + Env::Default()->Schedule(&BGFileCloser2, ptr, 4); + else + Env::Default()->Schedule(&BGFileUnmapper2, ptr, 4); } // else + if (and_close) + fd_=-1; + file_offset_ += limit_ - base_; base_ = NULL; limit_ = NULL; @@ -277,7 +294,8 @@ class PosixMmapFile : public WritableFile { public: PosixMmapFile(const std::string& fname, int fd, - size_t page_size, size_t file_offset=0L) + size_t page_size, size_t file_offset=0L, + bool is_sst=false) : filename_(fname), fd_(fd), page_size_(page_size), @@ -287,13 +305,14 @@ class PosixMmapFile : public WritableFile { dst_(NULL), last_sync_(NULL), file_offset_(file_offset), - pending_sync_(false) { + metadata_offset_(0), + pending_sync_(false), + is_sst_file_(is_sst) { assert((page_size & (page_size - 1)) == 0); gPerfCounters->Inc(ePerfRWFileOpen); } - ~PosixMmapFile() { if (fd_ >= 0) { PosixMmapFile::Close(); @@ -364,6 +383,11 @@ class PosixMmapFile : public WritableFile { return s; } + + virtual void SetMetadataOffset(uint64_t Metadata) + { + metadata_offset_=Metadata; + } // SetMetadataOffset }; @@ -480,7 +504,7 @@ class PosixEnv : public Env { *result = NULL; s = IOError(fname, errno); } else { - *result = new PosixMmapFile(fname, fd, page_size_); + *result = new PosixMmapFile(fname, fd, page_size_, 0, false); } return s; } @@ -509,6 +533,18 @@ class PosixEnv : public Env { return s; } + virtual Status NewSstFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = IOError(fname, errno); + } else { + *result = new PosixMmapFile(fname, fd, page_size_, 0, true); + } + return s; + } virtual bool FileExists(const std::string& fname) { @@ -1029,17 +1065,29 @@ void BGFileCloser(void * arg) void BGFileCloser2(void * arg) { BGCloseInfo * file_ptr; + int ret_val; file_ptr=(BGCloseInfo *)arg; munmap(file_ptr->base_, file_ptr->length_); #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + // release newly written data from Linux page cache if possible + if (0==file_ptr->metadata + || (file_ptr->offset_t + file_ptr->length_ < file_ptr->metadata)) + { + // must fdatasync for DONTNEED to work + fdatasync(file_ptr->fd_); + posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + } // if + else + { + posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + } // else #endif if (0 != file_ptr->unused_) - ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); close(file_ptr->fd_); delete file_ptr; @@ -1082,7 +1130,17 @@ void BGFileUnmapper2(void * arg) munmap(file_ptr->base_, file_ptr->length_); #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0==file_ptr->metadata + || (file_ptr->offset_t + file_ptr->length_ < file_ptr->metadata)) + { + // must fdatasync for DONTNEED to work + fdatasync(file_ptr->fd_); + posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + } // if + else + { + posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + } // else #endif delete file_ptr; From 242f1cdb6daf3e5e097af198a605a495802d6528 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Thu, 11 Jul 2013 18:17:23 -0400 Subject: [PATCH 2/9] correct syntax errors hidden on Mac OSX --- util/env_posix.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index c6e99acd..a6e09398 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1073,8 +1073,8 @@ void BGFileCloser2(void * arg) #if defined(HAVE_FADVISE) // release newly written data from Linux page cache if possible - if (0==file_ptr->metadata - || (file_ptr->offset_t + file_ptr->length_ < file_ptr->metadata)) + if (0==file_ptr->metadata_ + || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work fdatasync(file_ptr->fd_); @@ -1130,8 +1130,8 @@ void BGFileUnmapper2(void * arg) munmap(file_ptr->base_, file_ptr->length_); #if defined(HAVE_FADVISE) - if (0==file_ptr->metadata - || (file_ptr->offset_t + file_ptr->length_ < file_ptr->metadata)) + if (0==file_ptr->metadata_ + || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work fdatasync(file_ptr->fd_); From 45417c17eaeea8167b974d18eeef9fa2646a84e9 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Fri, 12 Jul 2013 12:54:02 -0400 Subject: [PATCH 3/9] prepare in-active code at key spots while looking around --- util/env_posix.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index a6e09398..ad987008 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -112,7 +112,12 @@ class PosixRandomAccessFile: public RandomAccessFile { public: PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) { } + : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) + { +#if defined(HAVE_FADVISE) + // posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM); +#endif + } virtual ~PosixRandomAccessFile() { if (is_compaction_) @@ -141,6 +146,9 @@ class PosixRandomAccessFile: public RandomAccessFile { { is_compaction_=true; file_size_=file_size; +#if defined(HAVE_FADVISE) +// posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL); +#endif }; From 752236b277d1531303f84eebdfd040475fbb84c4 Mon Sep 17 00:00:00 2001 From: Matthew V Date: Fri, 12 Jul 2013 19:40:01 -0400 Subject: [PATCH 4/9] Reverse the pattern. Default is same thread like .sst. Setup recovery log and others to use background. --- include/leveldb/env.h | 6 +++--- util/env_posix.cc | 28 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/leveldb/env.h b/include/leveldb/env.h index fc21cff6..e1d8e7a6 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -88,7 +88,7 @@ class Env { // background threads. // // The returned file will only be accessed by one thread at a time. - virtual Status NewSstFile(const std::string& fname, + virtual Status NewWriteOnlyFile(const std::string& fname, WritableFile** result) {return(NewWritableFile(fname, result));}; @@ -334,8 +334,8 @@ class EnvWrapper : public Env { Status NewAppendableFile(const std::string& f, WritableFile** r) { return target_->NewAppendableFile(f, r); } - Status NewSstFile(const std::string& f, WritableFile** r) { - return target_->NewSstFile(f, r); + Status NewWriteOnlyFile(const std::string& f, WritableFile** r) { + return target_->NewWriteOnlyFile(f, r); } bool FileExists(const std::string& f) { return target_->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) { diff --git a/util/env_posix.cc b/util/env_posix.cc index ad987008..5ff76abb 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -62,9 +62,9 @@ struct BGCloseInfo size_t unused_; uint64_t metadata_; - BGCloseInfo(int fd, void * base, size_t offset, size_t length, + BGCloseInfo(int fd, void * base, size_t offset, size_t length, size_t unused, uint64_t metadata) - : fd_(fd), base_(base), offset_(offset), length_(length), + : fd_(fd), base_(base), offset_(offset), length_(length), unused_(unused), metadata_(metadata) {}; }; @@ -112,8 +112,8 @@ class PosixRandomAccessFile: public RandomAccessFile { public: PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) - { + : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) + { #if defined(HAVE_FADVISE) // posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM); #endif @@ -209,7 +209,7 @@ class PosixMmapFile : public WritableFile { uint64_t file_offset_; // Offset of base_ in file uint64_t metadata_offset_; // Offset where sst metadata starts, or zero bool pending_sync_; // Have we done an munmap of unsynced data? - bool is_sst_file_; // Is this an sst file be created in background? + bool is_write_only_; // can this file process in background // Roundup x to a multiple of y static size_t Roundup(size_t x, size_t y) { @@ -230,12 +230,12 @@ class PosixMmapFile : public WritableFile { pending_sync_ = true; } - BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, + BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, limit_-dst_, metadata_offset_); - // sst_file is on background compaction thread, keep all operations - // in sequence - if (is_sst_file_) + // write only files can perform operations async, but not + // files that might re-open and read again soon + if (!is_write_only_) { if (and_close) BGFileCloser2(ptr); @@ -303,7 +303,7 @@ class PosixMmapFile : public WritableFile { public: PosixMmapFile(const std::string& fname, int fd, size_t page_size, size_t file_offset=0L, - bool is_sst=false) + bool is_write_only=false) : filename_(fname), fd_(fd), page_size_(page_size), @@ -315,7 +315,7 @@ class PosixMmapFile : public WritableFile { file_offset_(file_offset), metadata_offset_(0), pending_sync_(false), - is_sst_file_(is_sst) { + is_write_only_(is_write_only) { assert((page_size & (page_size - 1)) == 0); gPerfCounters->Inc(ePerfRWFileOpen); @@ -541,7 +541,7 @@ class PosixEnv : public Env { return s; } - virtual Status NewSstFile(const std::string& fname, + virtual Status NewWriteOnlyFile(const std::string& fname, WritableFile** result) { Status s; const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); @@ -1081,7 +1081,7 @@ void BGFileCloser2(void * arg) #if defined(HAVE_FADVISE) // release newly written data from Linux page cache if possible - if (0==file_ptr->metadata_ + if (0==file_ptr->metadata_ || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work @@ -1138,7 +1138,7 @@ void BGFileUnmapper2(void * arg) munmap(file_ptr->base_, file_ptr->length_); #if defined(HAVE_FADVISE) - if (0==file_ptr->metadata_ + if (0==file_ptr->metadata_ || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work From d2572f3e7f86b75435f6dd7e8dd58f9c332d68b4 Mon Sep 17 00:00:00 2001 From: Matthew V Date: Fri, 12 Jul 2013 19:44:53 -0400 Subject: [PATCH 5/9] explicitly put recovery log into background write/close loop --- db/db_impl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 2667945b..b1dc6f19 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1673,7 +1673,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { uint64_t new_log_number = versions_->NewFileNumber(); WritableFile* lfile = NULL; gPerfCounters->Inc(ePerfWriteNewMem); - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + s = env_->NewWriteOnlyFile(LogFileName(dbname_, new_log_number), &lfile); if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); @@ -1820,7 +1820,7 @@ Status DB::Open(const Options& options, const std::string& dbname, if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), + s = options.env->NewWriteOnlyFile(LogFileName(dbname, new_log_number), &lfile); if (s.ok()) { edit.SetLogNumber(new_log_number); From b88d9c4b51e6454693e1028a5afe5aa8c878d239 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Tue, 16 Jul 2013 15:28:20 -0400 Subject: [PATCH 6/9] Clear gcc variable not used warning, but keep call to h->key() incase the call creates side effects. --- util/cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cache.cc b/util/cache.cc index 2c1459ed..0e5f5d4c 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -116,7 +116,7 @@ class HandleTable { LRUHandle* h = list_[i]; while (h != NULL) { LRUHandle* next = h->next_hash; - Slice key = h->key(); + /*Slice key =*/ h->key(); // eliminate unused var warning, but allow for side-effects uint32_t hash = h->hash; LRUHandle** ptr = &new_list[hash & (new_length - 1)]; h->next_hash = *ptr; From 1df84ada20acf27498896ae78b5fcc1690eac331 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Wed, 17 Jul 2013 13:11:14 -0400 Subject: [PATCH 7/9] Add fallback error reporting to syslog and to new perf_counter. Enable SetForCompaction() in this code base (though not for an already open file). --- db/table_cache.cc | 3 +- include/leveldb/perf_count.h | 1 + util/env_posix.cc | 111 +++++++++++++++++++++++++++++++---- util/perf_count.cc | 3 +- 4 files changed, 103 insertions(+), 15 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index 8aeafbc1..1c63b414 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -88,6 +88,7 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level } else { + // (later, call SetForCompaction here too for files already in cache) gPerfCounters->Inc(ePerfTableCached); } // else return s; @@ -103,7 +104,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, level, &handle); + Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction()); if (!s.ok()) { return NewErrorIterator(s); } diff --git a/include/leveldb/perf_count.h b/include/leveldb/perf_count.h index c05a76ec..3d37eb31 100644 --- a/include/leveldb/perf_count.h +++ b/include/leveldb/perf_count.h @@ -189,6 +189,7 @@ enum PerformanceCountersEnum ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+) ePerfThrottleCompacts1=65,//!< number of level 1+ compactions + ePerfBGWriteError=66, //!< error in write/close, see syslog // must follow last index name to represent size of array // (ASSUMES previous enum is highest value) diff --git a/util/env_posix.cc b/util/env_posix.cc index 5ff76abb..ae7aa59a 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -147,7 +148,7 @@ class PosixRandomAccessFile: public RandomAccessFile { is_compaction_=true; file_size_=file_size; #if defined(HAVE_FADVISE) -// posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL); + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL); #endif }; @@ -1049,22 +1050,46 @@ void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { void BGFileCloser(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser munmap failed [%d, %m]", errno); + err_flag=true; + } // if #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if + #endif if (0 != file_ptr->unused_) - ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + { + ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if + } close(file_ptr->fd_); delete file_ptr; gPerfCounters->Inc(ePerfROFileClose); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileCloser @@ -1073,11 +1098,19 @@ void BGFileCloser(void * arg) void BGFileCloser2(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 munmap failed [%d, %m]", errno); + err_flag=true; + } // if + #if defined(HAVE_FADVISE) // release newly written data from Linux page cache if possible @@ -1085,23 +1118,48 @@ void BGFileCloser2(void * arg) || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work - fdatasync(file_ptr->fd_); - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + ret_val=fdatasync(file_ptr->fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 fdatasync failed [%d, %m]", errno); + err_flag=true; + } // if + + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // if else { - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise WILLNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // else #endif if (0 != file_ptr->unused_) + { ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); - + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if + } close(file_ptr->fd_); delete file_ptr; gPerfCounters->Inc(ePerfRWFileClose); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileCloser2 @@ -1132,28 +1190,55 @@ void BGFileUnmapper(void * arg) void BGFileUnmapper2(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 munmap failed [%d, %m]", errno); + err_flag=true; + } // if #if defined(HAVE_FADVISE) if (0==file_ptr->metadata_ || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) { // must fdatasync for DONTNEED to work - fdatasync(file_ptr->fd_); - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + ret_val=fdatasync(file_ptr->fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 fdatasync failed [%d, %m]", errno); + err_flag=true; + } // if + + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // if else { - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise WILLNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // else #endif delete file_ptr; gPerfCounters->Inc(ePerfRWFileUnmap); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileUnmapper2 diff --git a/util/perf_count.cc b/util/perf_count.cc index 91e29924..2aa86c15 100644 --- a/util/perf_count.cc +++ b/util/perf_count.cc @@ -514,7 +514,8 @@ PerformanceCounters * gPerfCounters(&LocalStartupCounters); "ThrottleMicros1", "ThrottleKeys1", "ThrottleBacklog1", - "ThrottleCompacts1" + "ThrottleCompacts1", + "BGWriteError" }; From 7bb846b29c2e6cb362345454137c58f006c80296 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Thu, 18 Jul 2013 16:19:02 -0400 Subject: [PATCH 8/9] Update comments per code review. --- include/leveldb/env.h | 6 +++--- util/env_posix.cc | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/leveldb/env.h b/include/leveldb/env.h index e1d8e7a6..add51a5d 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -83,9 +83,9 @@ class Env { WritableFile** result) = 0; // Riak specific: - // Derived from NewWritableFile. Sets flag that changes - // code paths related to unmap/close and the use of - // background threads. + // Derived from NewWritableFile. Special version of + // NewWritableFile that enables write and close operations + // to execute on background threads (where supported). // // The returned file will only be accessed by one thread at a time. virtual Status NewWriteOnlyFile(const std::string& fname, diff --git a/util/env_posix.cc b/util/env_posix.cc index ae7aa59a..791272a6 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -116,6 +116,9 @@ class PosixRandomAccessFile: public RandomAccessFile { : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) { #if defined(HAVE_FADVISE) + // Currently hurts performance instead of helps. Likely + // requires better interaction with tables already in cache + // that start compaction. See comment in table_cache.cc. // posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM); #endif } From cc0ff65c6d90f79487d7a185d476cc72edcbe3e2 Mon Sep 17 00:00:00 2001 From: MatthewVon Date: Mon, 22 Jul 2013 12:10:32 -0400 Subject: [PATCH 9/9] update comments and whitespace issues per code review. --- include/leveldb/env.h | 6 +++--- util/env_posix.cc | 50 +++++++++++++++++++++---------------------- util/perf_count.cc | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/include/leveldb/env.h b/include/leveldb/env.h index add51a5d..95f59108 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -83,9 +83,9 @@ class Env { WritableFile** result) = 0; // Riak specific: - // Derived from NewWritableFile. Special version of - // NewWritableFile that enables write and close operations - // to execute on background threads (where supported). + // Allows for virtualized version of NewWritableFile that enables write + // and close operations to execute on background threads + // (where platform supported). // // The returned file will only be accessed by one thread at a time. virtual Status NewWriteOnlyFile(const std::string& fname, diff --git a/util/env_posix.cc b/util/env_posix.cc index 791272a6..59c9d063 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1079,11 +1079,11 @@ void BGFileCloser(void * arg) if (0 != file_ptr->unused_) { ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); - if (0!=ret_val) - { - syslog(LOG_ERR,"BGFileCloser ftruncate failed [%d, %m]", errno); - err_flag=true; - } // if + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if } close(file_ptr->fd_); @@ -1122,38 +1122,38 @@ void BGFileCloser2(void * arg) { // must fdatasync for DONTNEED to work ret_val=fdatasync(file_ptr->fd_); - if (0!=ret_val) - { - syslog(LOG_ERR,"BGFileCloser2 fdatasync failed [%d, %m]", errno); - err_flag=true; - } // if + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 fdatasync failed [%d, %m]", errno); + err_flag=true; + } // if ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); - if (0!=ret_val) - { - syslog(LOG_ERR,"BGFileCloser2 posix_fadvise DONTNEED failed [%d, %m]", errno); - err_flag=true; - } // if + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // if else { ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); - if (0!=ret_val) - { - syslog(LOG_ERR,"BGFileCloser2 posix_fadvise WILLNEED failed [%d, %m]", errno); - err_flag=true; - } // if + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise WILLNEED failed [%d, %m]", errno); + err_flag=true; + } // if } // else #endif if (0 != file_ptr->unused_) { ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); - if (0!=ret_val) - { - syslog(LOG_ERR,"BGFileCloser2 ftruncate failed [%d, %m]", errno); - err_flag=true; - } // if + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if } close(file_ptr->fd_); delete file_ptr; diff --git a/util/perf_count.cc b/util/perf_count.cc index 2aa86c15..7aee5d99 100644 --- a/util/perf_count.cc +++ b/util/perf_count.cc @@ -515,7 +515,7 @@ PerformanceCounters * gPerfCounters(&LocalStartupCounters); "ThrottleKeys1", "ThrottleBacklog1", "ThrottleCompacts1", - "BGWriteError" + "BGWriteError" };