From 02f74d4008413201dc2d0fda9c5e0a72d1824f97 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 15 Sep 2016 18:00:10 -0500 Subject: [PATCH 01/16] ensure that docs are shuffled after a call to create_even_split --- include/meta/classify/classifier/classifier.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/meta/classify/classifier/classifier.h b/include/meta/classify/classifier/classifier.h index 23cf63528..17021c4d7 100644 --- a/include/meta/classify/classifier/classifier.h +++ b/include/meta/classify/classifier/classifier.h @@ -89,11 +89,13 @@ confusion_matrix cross_validate(Creator&& creator, bool even_split = false) { using diff_type = decltype(docs.begin())::difference_type; - // docs might be ordered by class, so make sure things are shuffled - docs.shuffle(); + if (even_split) docs = docs.create_even_split(); + // docs might be ordered by class, so make sure things are shuffled + docs.shuffle(); + confusion_matrix matrix; auto step_size = docs.size() / k; for (size_t i = 0; i < k; ++i) From 8247f286fefb0728db128773adf52d1fb7818b23 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 16 Sep 2016 18:37:34 -0500 Subject: [PATCH 02/16] clang-format classify.cpp and remove unneeded using declarations --- src/classify/tools/classify.cpp | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/classify/tools/classify.cpp b/src/classify/tools/classify.cpp index a68383665..97bc66ecf 100644 --- a/src/classify/tools/classify.cpp +++ b/src/classify/tools/classify.cpp @@ -1,5 +1,7 @@ /** - * @file classify-test.cpp + * @file classify.cpp + * @author Sean Massung + * @author Chase Geigle */ #include @@ -17,9 +19,6 @@ #include "meta/util/progress.h" #include "meta/util/time.h" -using std::cout; -using std::cerr; -using std::endl; using namespace meta; template @@ -27,12 +26,10 @@ classify::confusion_matrix cv(Creator&& creator, classify::multiclass_dataset_view docs, bool even) { classify::confusion_matrix matrix; - auto msec = common::time( - [&]() - { - matrix = classify::cross_validate(std::forward(creator), - docs, 5, even); - }); + auto msec = common::time([&]() { + matrix = classify::cross_validate(std::forward(creator), docs, + 5, even); + }); std::cerr << "time elapsed: " << msec.count() / 1000.0 << "s" << std::endl; matrix.print(); matrix.print_stats(); @@ -67,7 +64,7 @@ int main(int argc, char* argv[]) { if (argc != 2) { - cerr << "Usage:\t" << argv[0] << " config.toml" << endl; + std::cerr << "Usage:\t" << argv[0] << " config.toml" << std::endl; return 1; } @@ -81,7 +78,8 @@ int main(int argc, char* argv[]) auto class_config = config->get_table("classifier"); if (!class_config) { - cerr << "Missing classifier configuration group in " << argv[1] << endl; + std::cerr << "Missing classifier configuration group in " << argv[1] + << std::endl; return 1; } @@ -90,14 +88,14 @@ int main(int argc, char* argv[]) classify::multiclass_dataset dataset{f_idx}; std::function( - classify::multiclass_dataset_view)> creator; + classify::multiclass_dataset_view)> + creator; auto classifier_method = *class_config->get_as("method"); auto even = class_config->get_as("even-split").value_or(false); if (classifier_method == "knn" || classifier_method == "nearest-centroid") { auto i_idx = index::make_index(*config); - creator = [=](classify::multiclass_dataset_view fold) - { + creator = [=](classify::multiclass_dataset_view fold) { return classify::make_classifier(*class_config, std::move(fold), i_idx); }; @@ -105,8 +103,7 @@ int main(int argc, char* argv[]) else { - creator = [&](classify::multiclass_dataset_view fold) - { + creator = [&](classify::multiclass_dataset_view fold) { return classify::make_classifier(*class_config, std::move(fold)); }; } From 6bb1b27eff767ad632e5cffda6a6ae2a37f0c5b8 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 16 Sep 2016 21:25:41 -0500 Subject: [PATCH 03/16] make forward indexer listen to indexer-num-threads option --- src/index/forward_index.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 6ffdef756..17fab146d 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -53,7 +53,7 @@ class forward_index::impl * merged. */ void tokenize_docs(corpus::corpus& corpus, metadata_writer& mdata_writer, - uint64_t ram_budget); + uint64_t ram_budget, uint64_t num_threads); /** * Merges together num_chunks number of intermediate chunks, using the @@ -254,9 +254,21 @@ void forward_index::create_index(const cpptoml::table& config, impl_->load_labels(docs.size()); + auto max_threads = std::thread::hardware_concurrency(); + auto num_threads = static_cast( + config.get_as("indexer-num-threads") + .value_or(max_threads)); + if (num_threads > max_threads) + { + num_threads = max_threads; + LOG(warning) << "Reducing indexer-num-threads to the hardware " + "concurrency level of " + << max_threads << ENDLG; + } + // RAM budget is given in MB fwd_impl_->tokenize_docs(docs, mdata_writer, - ram_budget * 1024 * 1024); + ram_budget * 1024 * 1024, num_threads); impl_->load_term_id_mapping(); impl_->save_label_id_mapping(); fwd_impl_->total_unique_terms_ = impl_->total_unique_terms(); @@ -282,7 +294,8 @@ void forward_index::create_index(const cpptoml::table& config, void forward_index::impl::tokenize_docs(corpus::corpus& docs, metadata_writer& mdata_writer, - uint64_t ram_budget) + uint64_t ram_budget, + uint64_t num_threads) { std::mutex io_mutex; std::mutex corpus_mutex; @@ -365,8 +378,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus& docs, } }; - parallel::thread_pool pool; - auto num_threads = pool.thread_ids().size(); + parallel::thread_pool pool{num_threads}; std::vector> futures; futures.reserve(num_threads); for (size_t i = 0; i < num_threads; ++i) From 9061d7bef9edaf3add9e1cb9a5224fb8a02c1166 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 19 Sep 2016 23:31:44 -0500 Subject: [PATCH 04/16] Add horrible hack workaround for remove_all on Windows. On Windows, ::DeleteFile() only "marks a file for deletion on close" and therefore "the file deletion does not occur until the last handle to the file is closed". This means that, in some cases, the call to remove_directory in remove_all can fail if another process (like the Windows search indexer or a virus scanner) happens to still have one of the files in this directory that we deleted above open when we attempt to remove it. As a workaround, we'll attempt to remove a directory a maximum of three times, sleeping for 100ms more between each successive try. If removing the directory fails three times in a row, we will throw an exception and bail out. I really wish there were a better workaround than this, but I can't come up with anything and we want remove_all() to have the same (sane) semantics as it would have on Unix platforms. --- CHANGELOG.md | 18 ++++++++++++++++++ src/io/filesystem.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a0ff7e16..b5b8987e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +# [Unreleased][unreleased] +## New features +- Add an `embedding_analyzer` that represents documents with their averaged word + vectors. +- Add a `parallel::reduction` algorithm designed for parallelizing complex + accumulation operations (like an E step in an EM algorithm) +- Parallelize feature counting in feature selector using the new + `parallel::reduction` + +## Bug Fixes +- Properly shuffle documents when doing an even-split classification test +- Make forward indexer listen to `indexer-num-threads` config option. +- Use correct number of threads when deciding block sizes for + `parallel_for` +- Add workaround to `filesystem::remove_all` for Windows systems to avoid + spurious failures caused by virus scanners keeping files open after we + deleted them + # [v2.4.1][2.4.1] ## Bug fixes - Eliminate excess warnings on Darwin about double preprocessor definitions diff --git a/src/io/filesystem.cpp b/src/io/filesystem.cpp index 6342a0e0d..7e6d007a8 100644 --- a/src/io/filesystem.cpp +++ b/src/io/filesystem.cpp @@ -23,6 +23,13 @@ #include #else + +#ifdef _WIN32 +// chrono and thread for a sleep_for hack in remove_all below +#include +#include +#endif + #include #endif @@ -110,6 +117,39 @@ std::uintmax_t remove_all(const path_type& path) count += remove_all(nextpath); } +#ifdef _WIN32 + // On Windows, ::DeleteFile() only "marks a file for deletion on close" + // and therefore "the file deletion does not occur until the last + // handle to the file is closed". This means that, in some cases, the + // call to remove_directory below can fail if another process (like the + // Windows search indexer or a virus scanner) happens to still have one + // of the files in this directory that we deleted above open when we + // attempt to remove it. + // + // As a workaround, we'll attempt to remove a directory a maximum of + // three times, sleeping for 100ms more between each successive try. If + // removing the directory fails three times in a row, we will throw an + // exception and bail out. I really wish there were a better workaround + // than this, but I can't come up with anything and we want + // remove_all() to have the same (sane) semantics as it would have on + // Unix platforms. + std::chrono::milliseconds delay(0); + for (int i = 0; i < 3; ++i) + { + if (traits::remove_directory(path.c_str())) + { + count += 1; + return count; + } + delay += std::chrono::milliseconds(100); + std::this_thread::sleep_for(delay); + } + + // failed too many times + std::string error = "failed to recursively delete path "; + error += path.c_str(); + throw filesystem_exception{error}; +#else if (!traits::remove_directory(path.c_str())) { std::string error = "failed to recursively delete path "; @@ -119,6 +159,7 @@ std::uintmax_t remove_all(const path_type& path) count += 1; return count; +#endif } } From ae5bb2ba1e3df118fde41c99cad35679e6e4631f Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 21 Sep 2016 12:59:51 -0500 Subject: [PATCH 05/16] Fix invalid memory access in gzstreambuf::underflow(). Fixes issue #156. --- CHANGELOG.md | 1 + src/io/gzstream.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5b8987e3..521fcce95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - Add workaround to `filesystem::remove_all` for Windows systems to avoid spurious failures caused by virus scanners keeping files open after we deleted them +- Fix invalid memory access in `gzstreambuf::underflow` # [v2.4.1][2.4.1] ## Bug fixes diff --git a/src/io/gzstream.cpp b/src/io/gzstream.cpp index da2ef5df9..735046482 100644 --- a/src/io/gzstream.cpp +++ b/src/io/gzstream.cpp @@ -44,7 +44,7 @@ auto gzstreambuf::underflow() -> int_type return traits_type::eof(); } - setg(&buffer_[0], &buffer_[0], &buffer_[static_cast(bytes)]); + setg(&buffer_[0], &buffer_[0], &buffer_[0] + bytes); return traits_type::to_int_type(*gptr()); } From 5f9caf97a6f131932cc85666e95f8a46cbf06e73 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Sat, 17 Sep 2016 23:06:49 -0500 Subject: [PATCH 06/16] Fix bug in parallel_for in deciding block sizes for non-default pools. We were using std::thread::hardware_concurrency() as the number of threads when picking block sizes, but thread_pools can have thread counts less than that. --- include/meta/parallel/parallel_for.h | 9 +++++---- include/meta/parallel/thread_pool.h | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/meta/parallel/parallel_for.h b/include/meta/parallel/parallel_for.h index 3d253fac1..17240eeaf 100644 --- a/include/meta/parallel/parallel_for.h +++ b/include/meta/parallel/parallel_for.h @@ -47,14 +47,15 @@ template void parallel_for(Iterator begin, Iterator end, thread_pool& pool, Function func) { - auto block_size - = std::distance(begin, end) / std::thread::hardware_concurrency(); + using difference_type = + typename std::iterator_traits::difference_type; + auto pool_size = static_cast(pool.size()); + auto block_size = std::distance(begin, end) / pool_size; Iterator last = begin; if (block_size > 0) { - std::advance(last, - (std::thread::hardware_concurrency() - 1) * block_size); + std::advance(last, (pool_size - 1) * block_size); } else { diff --git a/include/meta/parallel/thread_pool.h b/include/meta/parallel/thread_pool.h index f221ab5eb..b7cf6e278 100644 --- a/include/meta/parallel/thread_pool.h +++ b/include/meta/parallel/thread_pool.h @@ -105,6 +105,14 @@ class thread_pool return tasks_.size(); } + /** + * @return the number of threads in the pool + */ + size_t size() const + { + return threads_.size(); + } + private: /** * A generic task object. From 8e32a503c5076f16184afec2f02170ddd2b005b0 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 21 Sep 2016 15:56:50 -0500 Subject: [PATCH 07/16] Add CMake export() calls for consuming MeTA without install. --- CMakeLists.txt | 5 +++++ deps/cpptoml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18d59e9cb..65403a5bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,3 +174,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfig.cmake DESTINATION lib/cmake/MeTA) + +# allow consumption of a build directory as an "installed" version +export(EXPORT meta-exports + FILE ${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTATargets.cmake) +export(PACKAGE MeTA) diff --git a/deps/cpptoml b/deps/cpptoml index 36e78ccad..82effa785 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit 36e78ccad3d650505116e93f2075463ba2a85491 +Subproject commit 82effa785000b97510cf83462a65af40519b1b42 From ccb7df1baa80d90e7babe95148faed9376eb5557 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 21 Sep 2016 16:31:42 -0500 Subject: [PATCH 08/16] Rename MeTAConfig.cmake to MeTAConfig.cmake.in. For some reason, lacking the .in suffix can cause CMake to erroneously find this file instead of the one in the build directory when consuming a build directory via the user package repository in CMake (but only on some systems). --- CMakeLists.txt | 2 +- cmake/{MeTAConfig.cmake => MeTAConfig.cmake.in} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cmake/{MeTAConfig.cmake => MeTAConfig.cmake.in} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 65403a5bb..a510a04aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,7 +161,7 @@ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/meta/ configure_file(cmake/MeTAConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfigVersion.cmake @ONLY) -configure_file(cmake/MeTAConfig.cmake +configure_file(cmake/MeTAConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfig.cmake COPYONLY) diff --git a/cmake/MeTAConfig.cmake b/cmake/MeTAConfig.cmake.in similarity index 100% rename from cmake/MeTAConfig.cmake rename to cmake/MeTAConfig.cmake.in From df6a99da88a0412a021b9c507eb8782a2d87cfb7 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 22 Sep 2016 11:31:10 -0500 Subject: [PATCH 09/16] Bump cpptoml version. --- deps/cpptoml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/cpptoml b/deps/cpptoml index 82effa785..4fd49e3f5 160000 --- a/deps/cpptoml +++ b/deps/cpptoml @@ -1 +1 @@ -Subproject commit 82effa785000b97510cf83462a65af40519b1b42 +Subproject commit 4fd49e3f5c4fa00467ad478b12ad2189d881a27a From 718878746bade0cf10336fc783d69df6d1759a7c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 21 Sep 2016 21:55:04 -0500 Subject: [PATCH 10/16] Bump meta-cmake version for detecting libc++experimental. --- deps/meta-cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/meta-cmake b/deps/meta-cmake index 1f1c7df44..f21b5fc5b 160000 --- a/deps/meta-cmake +++ b/deps/meta-cmake @@ -1 +1 @@ -Subproject commit 1f1c7df44473361cd7c1c38b36280f7f5aca3036 +Subproject commit f21b5fc5b9ad678bcf1fc7af8244e03147ed8a68 From 6c5daea06da95aeac83fdfeff0c766ad8288016e Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 22 Sep 2016 11:34:23 -0500 Subject: [PATCH 11/16] Update CHANGELOG for 2.4.2 patch release. --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 521fcce95..9a31dab32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Parallelize feature counting in feature selector using the new `parallel::reduction` +# [v2.4.2][2.4.2] ## Bug Fixes - Properly shuffle documents when doing an even-split classification test - Make forward indexer listen to `indexer-num-threads` config option. @@ -501,7 +502,8 @@ # [v1.0][1.0] - Initial release. -[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.4.1...develop +[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.4.2...develop +[2.4.2]: https://github.com/meta-toolkit/meta/compare/v2.4.1...v2.4.2 [2.4.1]: https://github.com/meta-toolkit/meta/compare/v2.4.0...v2.4.1 [2.4.0]: https://github.com/meta-toolkit/meta/compare/v2.3.0...v2.4.0 [2.3.0]: https://github.com/meta-toolkit/meta/compare/v2.2.0...v2.3.0 From 29209dd5ecd61de1be5e5cb9698b85bb2db7a975 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 22 Sep 2016 11:36:48 -0500 Subject: [PATCH 12/16] Remove unreleased section from CHANGELOG for 2.4.2 release. --- CHANGELOG.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a31dab32..1a5c4cf69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,3 @@ -# [Unreleased][unreleased] -## New features -- Add an `embedding_analyzer` that represents documents with their averaged word - vectors. -- Add a `parallel::reduction` algorithm designed for parallelizing complex - accumulation operations (like an E step in an EM algorithm) -- Parallelize feature counting in feature selector using the new - `parallel::reduction` - # [v2.4.2][2.4.2] ## Bug Fixes - Properly shuffle documents when doing an even-split classification test From cce942638b1a8fb54e34669a8739f1c8f95b9054 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 19 Sep 2016 23:43:05 -0500 Subject: [PATCH 13/16] Use OS X 10.10 for OS X + GCC build on Travis CI. This works around the build erroring out due to homebrew-versions not having a bottle made for OS X 10.9 for GCC 6.2.0. --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c58c2c9a3..a04447d3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -99,6 +99,7 @@ matrix: # OS X/GCC 6 - os: osx + osx_image: xcode6.4 env: COMPILER=gcc install: From 91e5f5c3cf240ae3a2b7dc5168634a3ea6e4e1ff Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 20 Sep 2016 01:06:09 -0500 Subject: [PATCH 14/16] Use Xcode 7.1 image on Travis OS X for GCC. The Xcode 6 image seems to make GCC unhappy. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a04447d3d..d4e8e71ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -99,7 +99,7 @@ matrix: # OS X/GCC 6 - os: osx - osx_image: xcode6.4 + osx_image: xcode7.1 env: COMPILER=gcc install: From bee6912e1b7b3677a9d93a19b31e54baeca7ca60 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 22 Sep 2016 12:41:13 -0500 Subject: [PATCH 15/16] Bump patch version number in CMakeLists.txt. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a510a04aa..b314bebdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS 1) set(MeTA_VERSION_MAJOR 2) set(MeTA_VERSION_MINOR 4) -set(MeTA_VERSION_PATCH 0) +set(MeTA_VERSION_PATCH 2) set(MeTA_VERSION "${MeTA_VERSION_MAJOR}.${MeTA_VERSION_MINOR}.${MeTA_VERSION_PATCH}") From b86d6cd363b1c409a0577199d3c823301c4a29fc Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 22 Sep 2016 15:32:17 -0500 Subject: [PATCH 16/16] Use Xcode 7.3 image for macOS + GCC CI build on Travis. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d4e8e71ea..054b00c3a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -99,7 +99,7 @@ matrix: # OS X/GCC 6 - os: osx - osx_image: xcode7.1 + osx_image: xcode7.3 env: COMPILER=gcc install: