Merge branch 'patch-2.4.2' for MeTA v2.4.2
skystrife committed Sep 23, 2016
2 parents: ed203f2 + b86d6cd · commit e09ac0e
Showing 13 changed files with 113 additions and 34 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -99,6 +99,7 @@ matrix:

# OS X/GCC 6
- os: osx
osx_image: xcode7.3
env: COMPILER=gcc

install:
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,14 @@
# [v2.4.2][2.4.2]
## Bug Fixes
- Properly shuffle documents when doing an even-split classification test
- Make forward indexer listen to `indexer-num-threads` config option.
- Use correct number of threads when deciding block sizes for
`parallel_for`
- Add workaround to `filesystem::remove_all` for Windows systems to avoid
spurious failures caused by virus scanners keeping files open after we
deleted them
- Fix invalid memory access in `gzstreambuf::underflow`

# [v2.4.1][2.4.1]
## Bug fixes
- Eliminate excess warnings on Darwin about double preprocessor definitions
@@ -482,7 +493,8 @@
# [v1.0][1.0]
- Initial release.

[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.4.1...develop
[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.4.2...develop
[2.4.2]: https://github.com/meta-toolkit/meta/compare/v2.4.1...v2.4.2
[2.4.1]: https://github.com/meta-toolkit/meta/compare/v2.4.0...v2.4.1
[2.4.0]: https://github.com/meta-toolkit/meta/compare/v2.3.0...v2.4.0
[2.3.0]: https://github.com/meta-toolkit/meta/compare/v2.2.0...v2.3.0
9 changes: 7 additions & 2 deletions CMakeLists.txt
@@ -9,7 +9,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS 1)

set(MeTA_VERSION_MAJOR 2)
set(MeTA_VERSION_MINOR 4)
set(MeTA_VERSION_PATCH 0)
set(MeTA_VERSION_PATCH 2)
set(MeTA_VERSION
"${MeTA_VERSION_MAJOR}.${MeTA_VERSION_MINOR}.${MeTA_VERSION_PATCH}")

@@ -161,7 +161,7 @@ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/meta/
configure_file(cmake/MeTAConfigVersion.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfigVersion.cmake
@ONLY)
configure_file(cmake/MeTAConfig.cmake
configure_file(cmake/MeTAConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfig.cmake
COPYONLY)

@@ -174,3 +174,8 @@ install(FILES
${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTAConfig.cmake
DESTINATION
lib/cmake/MeTA)

# allow consumption of a build directory as an "installed" version
export(EXPORT meta-exports
FILE ${CMAKE_CURRENT_BINARY_DIR}/MeTA/MeTATargets.cmake)
export(PACKAGE MeTA)
cmake/MeTAConfig.cmake → cmake/MeTAConfig.cmake.in: file renamed without changes.
2 changes: 1 addition & 1 deletion deps/cpptoml
2 changes: 1 addition & 1 deletion deps/meta-cmake
6 changes: 4 additions & 2 deletions include/meta/classify/classifier/classifier.h
@@ -89,11 +89,13 @@ confusion_matrix cross_validate(Creator&& creator,
bool even_split = false)
{
using diff_type = decltype(docs.begin())::difference_type;
// docs might be ordered by class, so make sure things are shuffled
docs.shuffle();

if (even_split)
docs = docs.create_even_split();

// docs might be ordered by class, so make sure things are shuffled
docs.shuffle();

confusion_matrix matrix;
auto step_size = docs.size() / k;
for (size_t i = 0; i < k; ++i)
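The fix above moves the shuffle to run after `create_even_split`, since the even-split view can come back grouped by class and the k folds are then taken from consecutive ranges of `step_size` documents. A minimal standalone sketch (plain C++, not the MeTA API) of why shuffling has to precede that slicing:

```cpp
// Minimal sketch of the fold slicing protected by the shuffle: if the working
// set were still grouped by class, each consecutive fold would hold mostly a
// single label.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

int main()
{
    // pretend dataset: 12 labels, grouped by class as an even split might leave them
    std::vector<int> labels = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2};

    // shuffle *after* building the (possibly class-grouped) working set
    std::mt19937 rng{std::random_device{}()};
    std::shuffle(labels.begin(), labels.end(), rng);

    // slice into k folds of step_size documents each, as cross_validate does
    const std::size_t k = 4;
    const std::size_t step_size = labels.size() / k;
    for (std::size_t i = 0; i < k; ++i)
    {
        std::cout << "fold " << i << ":";
        for (std::size_t j = i * step_size; j < (i + 1) * step_size; ++j)
            std::cout << ' ' << labels[j];
        std::cout << '\n'; // each fold now mixes labels instead of one class
    }
    return 0;
}
```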
9 changes: 5 additions & 4 deletions include/meta/parallel/parallel_for.h
@@ -47,14 +47,15 @@ template <class Iterator, class Function>
void parallel_for(Iterator begin, Iterator end, thread_pool& pool,
Function func)
{
auto block_size
= std::distance(begin, end) / std::thread::hardware_concurrency();
using difference_type =
typename std::iterator_traits<Iterator>::difference_type;
auto pool_size = static_cast<difference_type>(pool.size());
auto block_size = std::distance(begin, end) / pool_size;

Iterator last = begin;
if (block_size > 0)
{
std::advance(last,
(std::thread::hardware_concurrency() - 1) * block_size);
std::advance(last, (pool_size - 1) * block_size);
}
else
{
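The change above sizes blocks from `pool.size()` instead of `std::thread::hardware_concurrency()`, so the arithmetic matches the number of workers the pool actually owns: the first `pool_size - 1` workers each take `block_size` elements and the final worker takes the remainder. A standalone sketch of that partitioning (plain C++, hypothetical counts):

```cpp
// Standalone sketch of the block partitioning parallel_for performs: with 10
// items and a 4-thread pool, block_size is 10 / 4 = 2, the first 3 workers get
// 2 items each, and the last worker picks up the remaining 4.
#include <cstddef>
#include <iostream>

int main()
{
    const std::ptrdiff_t num_items = 10; // e.g. std::distance(begin, end)
    const std::ptrdiff_t pool_size = 4;  // e.g. pool.size()
    const std::ptrdiff_t block_size = num_items / pool_size; // 2

    std::ptrdiff_t start = 0;
    for (std::ptrdiff_t t = 0; t < pool_size - 1; ++t)
    {
        std::cout << "worker " << t << ": [" << start << ", "
                  << start + block_size << ")\n";
        start += block_size;
    }
    // the final worker handles [start, num_items), i.e. [6, 10) here
    std::cout << "worker " << pool_size - 1 << ": [" << start << ", "
              << num_items << ")\n";
    return 0;
}
```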
8 changes: 8 additions & 0 deletions include/meta/parallel/thread_pool.h
@@ -105,6 +105,14 @@ class thread_pool
return tasks_.size();
}

/**
* @return the number of threads in the pool
*/
size_t size() const
{
return threads_.size();
}

private:
/**
* A generic task object.
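A hedged usage sketch of the new accessor: the pool is constructed with an explicit thread count (as `forward_index` now does later in this commit) and `parallel_for` can size its blocks from `pool.size()`. It assumes, as is typical for this interface, that the callable is invoked once per element and that `parallel_for` waits for completion; neither detail is shown in the diff above.

```cpp
// Usage sketch (assumptions noted above): build a 4-thread pool, report its
// size via the new accessor, and run a parallel_for over it.
#include <atomic>
#include <iostream>
#include <vector>

#include "meta/parallel/parallel_for.h"
#include "meta/parallel/thread_pool.h"

int main()
{
    meta::parallel::thread_pool pool{4};
    std::cout << "pool threads: " << pool.size() << "\n"; // 4

    std::vector<int> values(1000, 1);
    std::atomic<int> sum{0};
    // assumed: func runs once per element and parallel_for blocks until done
    meta::parallel::parallel_for(values.begin(), values.end(), pool,
                                 [&](int v) { sum += v; });
    std::cout << "sum: " << sum.load() << "\n"; // 1000
    return 0;
}
```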
31 changes: 14 additions & 17 deletions src/classify/tools/classify.cpp
@@ -1,5 +1,7 @@
/**
* @file classify-test.cpp
* @file classify.cpp
* @author Sean Massung
* @author Chase Geigle
*/

#include <functional>
@@ -17,22 +19,17 @@
#include "meta/util/progress.h"
#include "meta/util/time.h"

using std::cout;
using std::cerr;
using std::endl;
using namespace meta;

template <class Creator>
classify::confusion_matrix cv(Creator&& creator,
classify::multiclass_dataset_view docs, bool even)
{
classify::confusion_matrix matrix;
auto msec = common::time(
[&]()
{
matrix = classify::cross_validate(std::forward<Creator>(creator),
docs, 5, even);
});
auto msec = common::time([&]() {
matrix = classify::cross_validate(std::forward<Creator>(creator), docs,
5, even);
});
std::cerr << "time elapsed: " << msec.count() / 1000.0 << "s" << std::endl;
matrix.print();
matrix.print_stats();
@@ -67,7 +64,7 @@ int main(int argc, char* argv[])
{
if (argc != 2)
{
cerr << "Usage:\t" << argv[0] << " config.toml" << endl;
std::cerr << "Usage:\t" << argv[0] << " config.toml" << std::endl;
return 1;
}

@@ -81,7 +78,8 @@ int main(int argc, char* argv[])
auto class_config = config->get_table("classifier");
if (!class_config)
{
cerr << "Missing classifier configuration group in " << argv[1] << endl;
std::cerr << "Missing classifier configuration group in " << argv[1]
<< std::endl;
return 1;
}

@@ -90,23 +88,22 @@ int main(int argc, char* argv[])
classify::multiclass_dataset dataset{f_idx};

std::function<std::unique_ptr<classify::classifier>(
classify::multiclass_dataset_view)> creator;
classify::multiclass_dataset_view)>
creator;
auto classifier_method = *class_config->get_as<std::string>("method");
auto even = class_config->get_as<bool>("even-split").value_or(false);
if (classifier_method == "knn" || classifier_method == "nearest-centroid")
{
auto i_idx = index::make_index<index::inverted_index>(*config);
creator = [=](classify::multiclass_dataset_view fold)
{
creator = [=](classify::multiclass_dataset_view fold) {
return classify::make_classifier(*class_config, std::move(fold),
i_idx);
};
}
else
{

creator = [&](classify::multiclass_dataset_view fold)
{
creator = [&](classify::multiclass_dataset_view fold) {
return classify::make_classifier(*class_config, std::move(fold));
};
}
22 changes: 17 additions & 5 deletions src/index/forward_index.cpp
@@ -53,7 +53,7 @@ class forward_index::impl
* merged.
*/
void tokenize_docs(corpus::corpus& corpus, metadata_writer& mdata_writer,
uint64_t ram_budget);
uint64_t ram_budget, uint64_t num_threads);

/**
* Merges together num_chunks number of intermediate chunks, using the
@@ -254,9 +254,21 @@ void forward_index::create_index(const cpptoml::table& config,

impl_->load_labels(docs.size());

auto max_threads = std::thread::hardware_concurrency();
auto num_threads = static_cast<unsigned>(
config.get_as<int64_t>("indexer-num-threads")
.value_or(max_threads));
if (num_threads > max_threads)
{
num_threads = max_threads;
LOG(warning) << "Reducing indexer-num-threads to the hardware "
"concurrency level of "
<< max_threads << ENDLG;
}

// RAM budget is given in MB
fwd_impl_->tokenize_docs(docs, mdata_writer,
ram_budget * 1024 * 1024);
ram_budget * 1024 * 1024, num_threads);
impl_->load_term_id_mapping();
impl_->save_label_id_mapping();
fwd_impl_->total_unique_terms_ = impl_->total_unique_terms();
@@ -282,7 +294,8 @@ void forward_index::create_index(const cpptoml::table& config,

void forward_index::impl::tokenize_docs(corpus::corpus& docs,
metadata_writer& mdata_writer,
uint64_t ram_budget)
uint64_t ram_budget,
uint64_t num_threads)
{
std::mutex io_mutex;
std::mutex corpus_mutex;
@@ -365,8 +378,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus& docs,
}
};

parallel::thread_pool pool;
auto num_threads = pool.thread_ids().size();
parallel::thread_pool pool{num_threads};
std::vector<std::future<void>> futures;
futures.reserve(num_threads);
for (size_t i = 0; i < num_threads; ++i)
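The logic added above reads `indexer-num-threads` from the configuration, falls back to the hardware concurrency when the option is absent, and clamps larger values back down with a warning. A standalone sketch of that selection (plain C++; `std::optional` stands in for the cpptoml lookup):

```cpp
// Standalone sketch of the thread-count selection: optional config value,
// hardware-concurrency fallback, and a clamp for oversized requests.
#include <iostream>
#include <optional>
#include <thread>

unsigned choose_num_threads(std::optional<long> configured)
{
    const unsigned max_threads = std::thread::hardware_concurrency();
    auto num_threads = static_cast<unsigned>(configured.value_or(max_threads));
    if (num_threads > max_threads)
    {
        std::cerr << "Reducing indexer-num-threads to the hardware "
                     "concurrency level of "
                  << max_threads << "\n";
        num_threads = max_threads;
    }
    return num_threads;
}

int main()
{
    std::cout << choose_num_threads(std::nullopt) << "\n"; // hardware default
    std::cout << choose_num_threads(2) << "\n";            // honored: 2
    std::cout << choose_num_threads(512) << "\n";          // clamped
    return 0;
}
```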
41 changes: 41 additions & 0 deletions src/io/filesystem.cpp
@@ -23,6 +23,13 @@

#include <platformstl/filesystem/directory_functions.hpp>
#else

#ifdef _WIN32
// chrono and thread for a sleep_for hack in remove_all below
#include <chrono>
#include <thread>
#endif

#include <experimental/filesystem>
#endif

@@ -110,6 +117,39 @@ std::uintmax_t remove_all(const path_type& path)
count += remove_all(nextpath);
}

#ifdef _WIN32
// On Windows, ::DeleteFile() only "marks a file for deletion on close"
// and therefore "the file deletion does not occur until the last
// handle to the file is closed". This means that, in some cases, the
// call to remove_directory below can fail if another process (like the
// Windows search indexer or a virus scanner) happens to still have one
// of the files in this directory that we deleted above open when we
// attempt to remove it.
//
// As a workaround, we'll attempt to remove a directory a maximum of
// three times, sleeping for 100ms more between each successive try. If
// removing the directory fails three times in a row, we will throw an
// exception and bail out. I really wish there were a better workaround
// than this, but I can't come up with anything and we want
// remove_all() to have the same (sane) semantics as it would have on
// Unix platforms.
std::chrono::milliseconds delay(0);
for (int i = 0; i < 3; ++i)
{
if (traits::remove_directory(path.c_str()))
{
count += 1;
return count;
}
delay += std::chrono::milliseconds(100);
std::this_thread::sleep_for(delay);
}

// failed too many times
std::string error = "failed to recursively delete path ";
error += path.c_str();
throw filesystem_exception{error};
#else
if (!traits::remove_directory(path.c_str()))
{
std::string error = "failed to recursively delete path ";
@@ -119,6 +159,7 @@ std::uintmax_t remove_all(const path_type& path)

count += 1;
return count;
#endif
}
}

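The Windows branch above retries the directory removal up to three times, sleeping 100ms longer after each failed attempt before finally throwing. A standalone sketch of that retry-with-growing-delay pattern (plain C++; the `try_remove` callable is a stand-in for `traits::remove_directory`):

```cpp
// Standalone sketch of the retry workaround: attempt the operation a fixed
// number of times, sleeping 100ms longer after each failure, then report
// failure so the caller can throw.
#include <chrono>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <thread>

bool retry_with_growing_delay(const std::function<bool()>& try_remove,
                              int max_attempts = 3)
{
    std::chrono::milliseconds delay{0};
    for (int i = 0; i < max_attempts; ++i)
    {
        if (try_remove())
            return true;
        delay += std::chrono::milliseconds{100};
        std::this_thread::sleep_for(delay);
    }
    return false;
}

int main()
{
    int failures_left = 2; // pretend a scanner holds a handle for two attempts
    if (!retry_with_growing_delay([&] { return failures_left-- <= 0; }))
        throw std::runtime_error{"failed to recursively delete path"};
    std::cout << "directory removed after retries\n";
    return 0;
}
```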
2 changes: 1 addition & 1 deletion src/io/gzstream.cpp
@@ -44,7 +44,7 @@ auto gzstreambuf::underflow() -> int_type
return traits_type::eof();
}

setg(&buffer_[0], &buffer_[0], &buffer_[static_cast<std::size_t>(bytes)]);
setg(&buffer_[0], &buffer_[0], &buffer_[0] + bytes);

return traits_type::to_int_type(*gptr());
}
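The one-line fix above addresses the case where a read fills the buffer completely: with `bytes == buffer_.size()`, `&buffer_[bytes]` indexes one past the end, which is undefined behavior for `operator[]`, while `&buffer_[0] + bytes` forms a valid one-past-the-end pointer for use as an exclusive range bound. A standalone sketch of the distinction:

```cpp
// Standalone sketch of the invalid-access fix: data() + n (equivalently
// &buf[0] + n) may point one past the end, but &buf[n] must not be formed
// when n == buf.size().
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<char> buffer(512);
    const std::size_t bytes = buffer.size(); // a full read fills the buffer

    // OK: one-past-the-end pointer, usable as an exclusive bound for setg()
    char* end_ok = buffer.data() + bytes;

    // NOT OK when bytes == buffer.size(): operator[] accesses out of range
    // char* end_bad = &buffer[bytes];

    std::cout << "range length: " << (end_ok - buffer.data()) << "\n"; // 512
    return 0;
}
```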
